/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 * TODO:
 * 1. Better handle wakeups from external interrupts. Currently a fixed
 *    compensation is added to the clamping duration when an excessive
 *    number of wakeups is observed during idle time. The reason is that,
 *    for external interrupts that need no ack, clamping down the cpu in
 *    non-irq context does not reduce the irq rate. In the majority of
 *    cases clamping down the cpu does help reduce irqs as well; we should
 *    be able to differentiate the two cases and give a quantitative
 *    solution for the irqs that we can control, perhaps based on
 *    get_cpu_iowait_time_us().
 *
 * 2. Synchronization with other hw blocks.
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to
 * meet the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
                                  * update control parameters. Defaults to
                                  * the BSP, but the BSP can be offlined.
                                  */
static bool clamping;

static const struct sched_param sparam = {
        .sched_priority = MAX_USER_RT_PRIO / 2,
};
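/*
 * Per-CPU bookkeeping for the clamping kthread worker.  "count" increments
 * once per balancing pass; every "window_size_now" passes the control cpu
 * re-evaluates the achieved idle ratio.  "guard" (1 + target_ratio / 20)
 * is the tolerance band above the target before injection is skipped.
 */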
struct powerclamp_worker_data {
        struct kthread_worker *worker;
        struct kthread_work balancing_work;
        struct kthread_delayed_work idle_injection_work;
        struct timer_list wakeup_timer;
        unsigned int cpu;
        unsigned int count;
        unsigned int guard;
        unsigned int window_size_now;
        unsigned int target_ratio;
        unsigned int duration_jiffies;
        bool clamping;
};

static struct powerclamp_worker_data * __percpu worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
                                           * clamping kthread worker
                                           */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_duration;

        ret = kstrtoul(arg, 10, &new_duration);
        if (ret)
                goto exit;
        if (new_duration > 25 || new_duration < 6) {
                pr_err("Out of recommended range %lu, between 6-25ms\n",
                        new_duration);
                ret = -EINVAL;
        }

        duration = clamp(new_duration, 6ul, 25ul);
        smp_mb();

exit:

        return ret;
}

static const struct kernel_param_ops duration_ops = {
        .set = duration_set,
        .get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
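/*
 * Example (assuming the standard module parameter sysfs layout): a root
 * user can tune the injection duration at runtime with
 *	echo 10 > /sys/module/intel_powerclamp/parameters/duration
 * which duration_set() then validates against the 6-25 ms range.
 */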

struct powerclamp_calibration_data {
        unsigned long confidence;   /* used for calibration, basically a
                                     * counter that gets incremented each
                                     * time a clamping period completes
                                     * without extra wakeups; once the
                                     * counter reaches the given level,
                                     * compensation is deemed usable.
                                     */
        unsigned long steady_comp;  /* steady state compensation used when
                                     * no extra wakeups occurred.
                                     */
        unsigned long dynamic_comp; /* compensates for excessive wakeups
                                     * from idle, mostly caused by external
                                     * interrupts.
                                     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
        int ret = 0;
        unsigned long new_window_size;

        ret = kstrtoul(arg, 10, &new_window_size);
        if (ret)
                goto exit_win;
        if (new_window_size > 10 || new_window_size < 2) {
                pr_err("Out of recommended window size %lu, between 2-10\n",
                        new_window_size);
                ret = -EINVAL;
        }

        window_size = clamp(new_window_size, 2ul, 10ul);
        smp_mb();

exit_win:

        return ret;
}

static const struct kernel_param_ops window_size_ops = {
        .set = window_size_set,
        .get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
        "\tpowerclamp controls idle ratio within this window. a larger\n"
        "\twindow size results in slower response time but smoother\n"
        "\tclamping results. defaults to 2.");

static void find_target_mwait(void)
{
        unsigned int eax, ebx, ecx, edx;
        unsigned int highest_cstate = 0;
        unsigned int highest_subcstate = 0;
        int i;

        if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
                return;

        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
            !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
                return;

        edx >>= MWAIT_SUBSTATE_SIZE;
        for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
                if (edx & MWAIT_SUBSTATE_MASK) {
                        highest_cstate = i;
                        highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
                }
        }
        target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
                (highest_subcstate - 1);

}
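/*
 * Note: the loop above walks the per-C-state sub-state counts reported in
 * CPUID leaf 5 EDX (one nibble per state, the C0 nibble having been shifted
 * off first) and keeps the deepest state that advertises at least one
 * sub-state.  The resulting MWAIT hint packs that state index into the
 * upper nibble and (sub-state - 1) into the lower nibble, and is later
 * passed to mwait_idle_with_hints() during idle injection.
 */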

struct pkg_cstate_info {
        bool skip;
        int msr_index;
        int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
                .msr_index = MSR_PKG_C##id##_RESIDENCY, \
                .cstate_id = id				\
        }

static struct pkg_cstate_info pkg_cstates[] = {
        PKG_CSTATE_INIT(2),
        PKG_CSTATE_INIT(3),
        PKG_CSTATE_INIT(6),
        PKG_CSTATE_INIT(7),
        PKG_CSTATE_INIT(8),
        PKG_CSTATE_INIT(9),
        PKG_CSTATE_INIT(10),
        {NULL},
};

static bool has_pkg_state_counter(void)
{
        u64 val;
        struct pkg_cstate_info *info = pkg_cstates;

        /* check if any one of the counter msrs exists */
        while (info->msr_index) {
                if (!rdmsrl_safe(info->msr_index, &val))
                        return true;
                info++;
        }

        return false;
}

static u64 pkg_state_counter(void)
{
        u64 val;
        u64 count = 0;
        struct pkg_cstate_info *info = pkg_cstates;

        while (info->msr_index) {
                if (!info->skip) {
                        if (!rdmsrl_safe(info->msr_index, &val))
                                count += val;
                        else
                                info->skip = true;
                }
                info++;
        }

        return count;
}

static void noop_timer(unsigned long foo)
{
        /* empty... just the fact that we get the interrupt wakes us up */
}

static unsigned int get_compensation(int ratio)
{
        unsigned int comp = 0;

        /* we only use compensation if all adjacent ones are good */
        if (ratio == 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio + 1].steady_comp +
                        cal_data[ratio + 2].steady_comp) / 3;
        } else if (ratio == MAX_TARGET_RATIO - 1 &&
                cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio - 2].steady_comp) / 3;
        } else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
                cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
                cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
                comp = (cal_data[ratio].steady_comp +
                        cal_data[ratio - 1].steady_comp +
                        cal_data[ratio + 1].steady_comp) / 3;
        }

        /* REVISIT: simple penalty of double idle injection */
        if (reduce_irq)
                comp = ratio;
        /* do not exceed limit */
        if (comp + ratio >= MAX_TARGET_RATIO)
                comp = MAX_TARGET_RATIO - ratio - 1;

        return comp;
}
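/*
 * Example for get_compensation() above: with a target ratio of 20 and
 * calibrated steady_comp values of 2, 3 and 4 for ratios 19-21 (all with
 * confidence >= CONFIDENCE_OK), the compensation is (2 + 3 + 4) / 3 = 3,
 * so an idle ratio of 23 is injected to achieve an observed package
 * C-state ratio of about 20.
 */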

static void adjust_compensation(int target_ratio, unsigned int win)
{
        int delta;
        struct powerclamp_calibration_data *d = &cal_data[target_ratio];

        /*
         * Skip the adjustment if the confidence level has already been
         * reached, or if there were too many wakeups during the last idle
         * injection period; in that case we cannot trust the data for
         * compensation.
         */
        if (d->confidence >= CONFIDENCE_OK ||
                atomic_read(&idle_wakeup_counter) >
                win * num_online_cpus())
                return;

        delta = set_target_ratio - current_ratio;
        /* filter out bad data */
        if (delta >= 0 && delta <= (1+target_ratio/10)) {
                if (d->steady_comp)
                        d->steady_comp =
                                roundup(delta+d->steady_comp, 2)/2;
                else
                        d->steady_comp = delta;
                d->confidence++;
        }
}
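/*
 * The steady_comp update above is a simple running average: the new delta
 * (requested minus achieved ratio) is folded into the previous value as
 * roundup(delta + steady_comp, 2) / 2, so a delta of 3 on top of a stored
 * steady_comp of 2 yields roundup(5, 2) / 2 = 3.
 */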

static bool powerclamp_adjust_controls(unsigned int target_ratio,
                                unsigned int guard, unsigned int win)
{
        static u64 msr_last, tsc_last;
        u64 msr_now, tsc_now;
        u64 val64;

        /* check result for the last window */
        msr_now = pkg_state_counter();
        tsc_now = rdtsc();

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                current_ratio = 1;
        else if (tsc_now - tsc_last) {
                val64 = 100 * (msr_now - msr_last);
                do_div(val64, (tsc_now - tsc_last));
                current_ratio = val64;
        }

        /* update record */
        msr_last = msr_now;
        tsc_last = tsc_now;

        adjust_compensation(target_ratio, win);
        /*
         * too many external interrupts, set flag such
         * that we can take measures later.
         */
        reduce_irq = atomic_read(&idle_wakeup_counter) >=
                2 * win * num_online_cpus();

        atomic_set(&idle_wakeup_counter, 0);
        /* if we are above target+guard, skip */
        return set_target_ratio + guard <= current_ratio;
}
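/*
 * current_ratio above is the share of TSC cycles the package spent in any
 * package C-state over the last window, e.g. 300 million residency counts
 * against 1 billion elapsed TSC cycles gives current_ratio = 30.
 * powerclamp_adjust_controls() returns true (skip injection) once that
 * measured ratio already exceeds the requested target plus the guard band.
 */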

static void clamp_balancing_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;
        int sleeptime;
        unsigned long target_jiffies;
        unsigned int compensated_ratio;
        int interval; /* jiffies to sleep for each attempt */

        w_data = container_of(work, struct powerclamp_worker_data,
                              balancing_work);

        /*
         * make sure a user-selected ratio does not take effect until
         * the next round. adjust target_ratio if the user has changed
         * the target, so that we can converge quickly.
         */
        w_data->target_ratio = READ_ONCE(set_target_ratio);
        w_data->guard = 1 + w_data->target_ratio / 20;
        w_data->window_size_now = window_size;
        w_data->duration_jiffies = msecs_to_jiffies(duration);
        w_data->count++;

        /*
         * systems may have different ability to enter package level
         * c-states, thus we need to compensate the injected idle ratio
         * to achieve the actual target reported by the HW.
         */
        compensated_ratio = w_data->target_ratio +
                get_compensation(w_data->target_ratio);
        if (compensated_ratio <= 0)
                compensated_ratio = 1;
        interval = w_data->duration_jiffies * 100 / compensated_ratio;

        /* align idle time */
        target_jiffies = roundup(jiffies, interval);
        sleeptime = target_jiffies - jiffies;
        if (sleeptime <= 0)
                sleeptime = 1;

        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_delayed_work(w_data->worker,
                                           &w_data->idle_injection_work,
                                           sleeptime);
}
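/*
 * Example of the duty-cycle math above: with the default 6-jiffy injection
 * duration and a compensated ratio of 25, interval = 6 * 100 / 25 = 24, so
 * each worker rounds up to the next 24-jiffy boundary before injecting 6
 * jiffies of idle time.  Aligning on the same jiffy boundary keeps the
 * injected idle periods synchronized across CPUs, which is what makes
 * package C-state entry possible.
 */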

static void clamp_idle_injection_func(struct kthread_work *work)
{
        struct powerclamp_worker_data *w_data;
        unsigned long target_jiffies;

        w_data = container_of(work, struct powerclamp_worker_data,
                              idle_injection_work.work);

        /*
         * only the elected controlling cpu can collect stats and update
         * control parameters.
         */
        if (w_data->cpu == control_cpu &&
            !(w_data->count % w_data->window_size_now)) {
                should_skip =
                        powerclamp_adjust_controls(w_data->target_ratio,
                                                   w_data->guard,
                                                   w_data->window_size_now);
                smp_mb();
        }

        if (should_skip)
                goto balance;

        target_jiffies = jiffies + w_data->duration_jiffies;
        mod_timer(&w_data->wakeup_timer, target_jiffies);
        if (unlikely(local_softirq_pending()))
                goto balance;
        /*
         * interrupts are still allowed during the mwait loop below,
         * thus jiffies are updated properly and extra wakeups can be
         * counted.
         */
        preempt_disable();
        /* mwait until target jiffies is reached */
        while (time_before(jiffies, target_jiffies)) {
                unsigned long ecx = 1;
                unsigned long eax = target_mwait;

                /*
                 * REVISIT: may call enter_idle() to notify drivers who
                 * can save power during cpu idle. same for exit_idle()
                 */
                local_touch_nmi();
                stop_critical_timings();
                mwait_idle_with_hints(eax, ecx);
                start_critical_timings();
                atomic_inc(&idle_wakeup_counter);
        }
        preempt_enable();

balance:
        if (clamping && w_data->clamping && cpu_online(w_data->cpu))
                kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
        static u64 msr_last;
        static u64 tsc_last;
        static unsigned long jiffies_last;

        u64 msr_now;
        unsigned long jiffies_now;
        u64 tsc_now;
        u64 val64;

        msr_now = pkg_state_counter();
        tsc_now = rdtsc();
        jiffies_now = jiffies;

        /* calculate pkg cstate vs tsc ratio */
        if (!msr_last || !tsc_last)
                pkg_cstate_ratio_cur = 1;
        else {
                if (tsc_now - tsc_last) {
                        val64 = 100 * (msr_now - msr_last);
                        do_div(val64, (tsc_now - tsc_last));
                        pkg_cstate_ratio_cur = val64;
                }
        }

        /* update record */
        msr_last = msr_now;
        jiffies_last = jiffies_now;
        tsc_last = tsc_now;

        if (clamping)
                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
        struct kthread_worker *worker;

        worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inject/%ld", cpu);
        if (IS_ERR(worker))
                return;

        w_data->worker = worker;
        w_data->count = 0;
        w_data->cpu = cpu;
        w_data->clamping = true;
        set_bit(cpu, cpu_clamping_mask);
        setup_timer(&w_data->wakeup_timer, noop_timer, 0);
        sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
        kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
        kthread_init_delayed_work(&w_data->idle_injection_work,
                                  clamp_idle_injection_func);
        kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
        struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

        if (!w_data->worker)
                return;

        w_data->clamping = false;
        /*
         * Make sure that all works that get queued after this point see
         * the clamping disabled. The counterpart is not needed because
         * there is an implicit memory barrier when the queued work
         * is processed.
         */
        smp_wmb();
        kthread_cancel_work_sync(&w_data->balancing_work);
        kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
        /*
         * The balancing work might still be queued here because
         * the handling of the "clamping" variable, cancel, and queue
         * operations are not synchronized via a lock. But it is not
         * a big deal. The balancing work is fast and destroying the
         * kthread worker will wait for it.
         */
        del_timer_sync(&w_data->wakeup_timer);
        clear_bit(w_data->cpu, cpu_clamping_mask);
        kthread_destroy_worker(w_data->worker);

        w_data->worker = NULL;
}

static int start_power_clamp(void)
{
        unsigned long cpu;

        set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
        /* prevent cpu hotplug */
        get_online_cpus();

        /* prefer BSP */
        control_cpu = 0;
        if (!cpu_online(control_cpu))
                control_cpu = smp_processor_id();

        clamping = true;
        schedule_delayed_work(&poll_pkg_cstate_work, 0);

        /* start one kthread worker per online cpu */
        for_each_online_cpu(cpu) {
                start_power_clamp_worker(cpu);
        }
        put_online_cpus();

        return 0;
}

static void end_power_clamp(void)
{
        int i;

        /*
         * Block requeuing in all the kthread workers. They will flush and
         * stop faster.
         */
        clamping = false;
        if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
                for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
                        pr_debug("clamping worker for cpu %d alive, destroy\n",
                                 i);
                        stop_power_clamp_worker(i);
                }
        }
}

static int powerclamp_cpu_online(unsigned int cpu)
{
        if (!clamping)
                return 0;
        start_power_clamp_worker(cpu);
        /* prefer BSP as controlling CPU */
        if (cpu == 0) {
                control_cpu = 0;
                smp_mb();
        }
        return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
        if (!clamping)
                return 0;

        stop_power_clamp_worker(cpu);
        if (cpu != control_cpu)
                return 0;

        control_cpu = cpumask_first(cpu_online_mask);
        if (control_cpu == cpu)
                control_cpu = cpumask_next(cpu, cpu_online_mask);
        smp_mb();
        return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        *state = MAX_TARGET_RATIO;

        return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
{
        if (clamping)
                *state = pkg_cstate_ratio_cur;
        else
                /* to save power, do not poll idle ratio while not clamping */
                *state = -1; /* indicates invalid state */

        return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
                                 unsigned long new_target_ratio)
{
        int ret = 0;

        new_target_ratio = clamp(new_target_ratio, 0UL,
                                (unsigned long) (MAX_TARGET_RATIO - 1));
        if (set_target_ratio == 0 && new_target_ratio > 0) {
                pr_info("Start idle injection to reduce power\n");
                set_target_ratio = new_target_ratio;
                ret = start_power_clamp();
                goto exit_set;
        } else if (set_target_ratio > 0 && new_target_ratio == 0) {
                pr_info("Stop forced idle injection\n");
                end_power_clamp();
                set_target_ratio = 0;
        } else  /* adjust currently running */ {
                set_target_ratio = new_target_ratio;
                /* make new set_target_ratio visible to other cpus */
                smp_mb();
        }

exit_set:
        return ret;
}
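/*
 * Usage note: this callback is normally reached through the generic thermal
 * sysfs interface, e.g. (the cooling device index X is system dependent)
 *	echo 30 > /sys/class/thermal/cooling_deviceX/cur_state
 * requests roughly 30% forced idle time, and writing 0 stops injection.
 */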

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
        .get_max_state = powerclamp_get_max_state,
        .get_cur_state = powerclamp_get_cur_state,
        .set_cur_state = powerclamp_set_cur_state,
};

static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_ARAT },
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_NONSTOP_TSC },
        { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_CONSTANT_TSC},
        {}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
        if (!x86_match_cpu(intel_powerclamp_ids)) {
                pr_err("Intel powerclamp does not run on family %d model %d\n",
                                boot_cpu_data.x86, boot_cpu_data.x86_model);
                return -ENODEV;
        }

        /* The goal for idle time alignment is to achieve package cstate. */
        if (!has_pkg_state_counter()) {
                pr_info("No package C-state available");
                return -ENODEV;
        }

        /* find the deepest mwait value */
        find_target_mwait();

        return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
        int i = 0;

        seq_printf(m, "controlling cpu: %d\n", control_cpu);
        seq_printf(m, "pct confidence steady dynamic (compensation)\n");
        for (i = 0; i < MAX_TARGET_RATIO; i++) {
                seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
                        i,
                        cal_data[i].confidence,
                        cal_data[i].steady_comp,
                        cal_data[i].dynamic_comp);
        }

        return 0;
}

static int powerclamp_debug_open(struct inode *inode,
                        struct file *file)
{
        return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
        .open = powerclamp_debug_open,
        .read = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
        .owner = THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
        debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
        if (!debug_dir)
                return;

        if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
                                cal_data, &powerclamp_debug_fops))
                goto file_error;

        return;

file_error:
        debugfs_remove_recursive(debug_dir);
}

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
        int retval;
        int bitmap_size;

        bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
        cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
        if (!cpu_clamping_mask)
                return -ENOMEM;

        /* probe cpu features and ids here */
        retval = powerclamp_probe();
        if (retval)
                goto exit_free;

        /* set default limit, maybe adjusted during runtime based on feedback */
        window_size = 2;
        retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
                                           "thermal/intel_powerclamp:online",
                                           powerclamp_cpu_online,
                                           powerclamp_cpu_predown);
        if (retval < 0)
                goto exit_free;

        hp_state = retval;

        worker_data = alloc_percpu(struct powerclamp_worker_data);
        if (!worker_data) {
                retval = -ENOMEM;
                goto exit_unregister;
        }

        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
                                                      &powerclamp_cooling_ops);
        if (IS_ERR(cooling_dev)) {
                retval = -ENODEV;
                goto exit_free_thread;
        }

        if (!duration)
                duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

        powerclamp_create_debug_files();

        return 0;

exit_free_thread:
        free_percpu(worker_data);
exit_unregister:
        cpuhp_remove_state_nocalls(hp_state);
exit_free:
        kfree(cpu_clamping_mask);
        return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
        end_power_clamp();
        cpuhp_remove_state_nocalls(hp_state);
        free_percpu(worker_data);
        thermal_cooling_device_unregister(cooling_dev);
        kfree(cpu_clamping_mask);

        cancel_delayed_work_sync(&poll_pkg_cstate_work);
        debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");