// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * TODO:
 *	1. Better handle wakeups from external interrupts. Currently a fixed
 *	   compensation is added to the clamping duration when an excessive
 *	   number of wakeups is observed during idle time. The reason is that,
 *	   for external interrupts that need no ack, clamping down the CPU in
 *	   non-irq context does not reduce the irq rate. In the majority of
 *	   cases clamping down the CPU does help reduce irqs as well, so we
 *	   should be able to differentiate the two cases and give a
 *	   quantitative solution for the irqs that we can control, perhaps
 *	   based on get_cpu_iowait_time_us().
 *
 *	2. Synchronization with other hw blocks.
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wake ups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level where runtime calibration results are
 * valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration, the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stat and update
				  * control parameters. default to BSP but BSP
				  * can be offlined.
				  */
static bool clamping;

struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping kthread worker
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");

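/*
 * Note: duration is a writable module parameter; on a standard sysfs layout it
 * can be changed at runtime via
 * /sys/module/intel_powerclamp/parameters/duration. As duration_set() above is
 * written, a value outside 6-25 ms makes the write fail with -EINVAL, but the
 * value is still clamped into range and stored.
 */
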
struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration: a counter that is
				    * incremented each time a clamping period
				    * completes without extra wakeups. Once the
				    * counter reaches the given level, the
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensate excessive wakeup from idle,
				     * mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");

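/*
 * Find the deepest MWAIT hint the CPU advertises. CPUID leaf 5 (MONITOR/MWAIT)
 * reports in EDX the number of sub C-states supported for each C-state, in
 * consecutive 4-bit fields. The loop below skips the C0 field, walks the
 * remaining fields and records the deepest populated one; the resulting MWAIT
 * hint is (cstate field << MWAIT_SUBSTATE_SIZE) | (number of sub-states - 1).
 */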
static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

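/*
 * Sum all available package C-state residency counters. The callers below
 * divide this sum's delta by the TSC delta, i.e. the driver assumes the
 * MSR_PKG_Cx_RESIDENCY counters advance at the TSC rate while the package is
 * in the corresponding C-state. MSRs that fault on read are marked skip and
 * excluded from subsequent reads.
 */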
static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

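/*
 * Look up the extra idle percentage to inject on top of the target ratio.
 * The calibrated steady_comp of the requested ratio is averaged with its two
 * neighbours (shifted to one side at the ends of the table) and is only used
 * once all three entries have reached CONFIDENCE_OK. For example, a target of
 * 30% with a compensation of 5 results in 35% idle actually being injected.
 */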
static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Do not adjust the compensation if the confidence level has already
	 * been reached, or if there were too many wakeups during the last
	 * idle injection period: in that case we cannot trust the data.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

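/*
 * Evaluate the window that just ended: compute the achieved package C-state
 * ratio as 100 * (residency counter delta) / (TSC delta), feed it into the
 * calibration data, and return true if the achieved ratio already exceeds the
 * target plus the guard band, in which case the next injection is skipped.
 */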
static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set a flag so that
	 * we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * make sure user selected ratio does not take effect until
	 * the next round. adjust target_ratio if user has changed
	 * target such that we can converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * systems may have different ability to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
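	/*
	 * The injection period is sized so that duration_jiffies of idle per
	 * period yields the compensated ratio. For example, with the default
	 * 6-jiffy duration and a compensated ratio of 25(%), the interval is
	 * 6 * 100 / 25 = 24 jiffies, i.e. 6 idle jiffies out of every 24.
	 */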
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}

static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (true == clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

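/*
 * Each clamped CPU gets a dedicated kthread worker running at FIFO priority.
 * The worker alternates between balancing_work (recompute the injection
 * parameters and schedule the next injection) and idle_injection_work (the
 * actual forced-idle period); cpu_clamping_mask tracks which CPUs currently
 * have a live worker.
 */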
static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_set_fifo(worker->task);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all works that get queued after this point see
	 * the clamping disabled. The counterpart is not needed because
	 * there is an implicit memory barrier when the queued work
	 * is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work might still be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroying the
	 * kthread worker will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping worker for cpu %d alive, destroy\n",
				 i);
			stop_power_clamp_worker(i);
		}
	}
}

static int powerclamp_cpu_online(unsigned int cpu)
{
	if (clamping == false)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (clamping == false)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (true == clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};

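/*
 * As a thermal cooling device, the clamp target is driven through the generic
 * thermal sysfs interface. On a typical system (the device index below is only
 * an example), something like
 *   echo 30 > /sys/class/thermal/cooling_device4/cur_state
 * requests roughly 30% forced idle, and writing 0 stops the injection, as
 * handled by powerclamp_set_cur_state() above.
 */
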
static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{

	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* The goal for idle time alignment is to achieve package cstate. */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						      &powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");