1/* Copyright (c) 2012-2016, The Linux Foundation. All rights reserved.
2 *
3 * This program is free software; you can redistribute it and/or modify
4 * it under the terms of the GNU General Public License version 2 and
5 * only version 2 as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * Implementation credits: Srivatsa Vaddagiri, Steve Muckle
13 * Syed Rameez Mustafa, Olav haugan, Joonwoo Park, Pavan Kumar Kondeti
14 * and Vikram Mulukutla
15 */
16
17#include <linux/cpufreq.h>
18#include <linux/list_sort.h>
19#include <linux/syscore_ops.h>
20
21#include "sched.h"
22
23#include <trace/events/sched.h>
24
25const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
26 "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE", "IRQ_UPDATE"};
27
28const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
29 "RQ_TO_RQ", "GROUP_TO_GROUP"};
30
31
32static ktime_t ktime_last;
33static bool sched_ktime_suspended;
34
35static bool use_cycle_counter;
36static struct cpu_cycle_counter_cb cpu_cycle_counter_cb;
37
38u64 sched_ktime_clock(void)
39{
40 if (unlikely(sched_ktime_suspended))
41 return ktime_to_ns(ktime_last);
42 return ktime_get_ns();
43}
44
45static void sched_resume(void)
46{
47 sched_ktime_suspended = false;
48}
49
50static int sched_suspend(void)
51{
52 ktime_last = ktime_get();
53 sched_ktime_suspended = true;
54 return 0;
55}
56
57static struct syscore_ops sched_syscore_ops = {
58 .resume = sched_resume,
59 .suspend = sched_suspend
60};
61
62static int __init sched_init_ops(void)
63{
64 register_syscore_ops(&sched_syscore_ops);
65 return 0;
66}
67late_initcall(sched_init_ops);
68
69inline void clear_ed_task(struct task_struct *p, struct rq *rq)
70{
71 if (p == rq->ed_task)
72 rq->ed_task = NULL;
73}
74
75inline void set_task_last_wake(struct task_struct *p, u64 wallclock)
76{
77 p->last_wake_ts = wallclock;
78}
79
80inline void set_task_last_switch_out(struct task_struct *p, u64 wallclock)
81{
82 p->last_switch_out_ts = wallclock;
83}
84
85/*
86 * Note C-state for (idle) cpus.
87 *
88 * @cstate = cstate index, 0 -> active state
89 * @wakeup_energy = energy spent in waking up cpu
90 * @wakeup_latency = latency to wakeup from cstate
91 *
92 */
93void
94sched_set_cpu_cstate(int cpu, int cstate, int wakeup_energy, int wakeup_latency)
95{
96 struct rq *rq = cpu_rq(cpu);
97
98 rq->cstate = cstate; /* C1, C2 etc */
99 rq->wakeup_energy = wakeup_energy;
100 rq->wakeup_latency = wakeup_latency;
101}
102
103/*
104 * Note D-state for (idle) cluster.
105 *
106 * @dstate = dstate index, 0 -> active state
107 * @wakeup_energy = energy spent in waking up cluster
108 * @wakeup_latency = latency to wakeup from dstate
109 *
110 */
111void sched_set_cluster_dstate(const cpumask_t *cluster_cpus, int dstate,
112 int wakeup_energy, int wakeup_latency)
113{
114 struct sched_cluster *cluster =
115 cpu_rq(cpumask_first(cluster_cpus))->cluster;
116 cluster->dstate = dstate;
117 cluster->dstate_wakeup_energy = wakeup_energy;
118 cluster->dstate_wakeup_latency = wakeup_latency;
119}
120
121u32 __weak get_freq_max_load(int cpu, u32 freq)
122{
123 /* 100% by default */
124 return 100;
125}
126
127struct freq_max_load_entry {
128 /* The maximum load which accounts for the governor's headroom. */
129 u64 hdemand;
130};
131
132struct freq_max_load {
133 struct rcu_head rcu;
134 int length;
135 struct freq_max_load_entry freqs[0];
136};
137
138static DEFINE_PER_CPU(struct freq_max_load *, freq_max_load);
139static DEFINE_SPINLOCK(freq_max_load_lock);
140
141struct cpu_pwr_stats __weak *get_cpu_pwr_stats(void)
142{
143 return NULL;
144}
145
146int sched_update_freq_max_load(const cpumask_t *cpumask)
147{
148 int i, cpu, ret;
149 unsigned int freq;
150 struct cpu_pstate_pwr *costs;
151 struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
152 struct freq_max_load *max_load, *old_max_load;
153 struct freq_max_load_entry *entry;
154 u64 max_demand_capacity, max_demand;
155 unsigned long flags;
156 u32 hfreq;
157 int hpct;
158
159 if (!per_cpu_info)
160 return 0;
161
162 spin_lock_irqsave(&freq_max_load_lock, flags);
163 max_demand_capacity = div64_u64(max_task_load(), max_possible_capacity);
164 for_each_cpu(cpu, cpumask) {
165 if (!per_cpu_info[cpu].ptable) {
166 ret = -EINVAL;
167 goto fail;
168 }
169
170 old_max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
171
172 /*
173 * Allocate len + 1 entries and leave the last power cost as 0 so
174 * that power_cost() can stop iterating when per_cpu_info[cpu].len
175 * is greater than the length of max_load, due to a race between the
176 * cpu power stats update and get_cpu_pwr_stats().
177 */
178 max_load = kzalloc(sizeof(struct freq_max_load) +
179 sizeof(struct freq_max_load_entry) *
180 (per_cpu_info[cpu].len + 1), GFP_ATOMIC);
181 if (unlikely(!max_load)) {
182 ret = -ENOMEM;
183 goto fail;
184 }
185
186 max_load->length = per_cpu_info[cpu].len;
187
188 max_demand = max_demand_capacity *
189 cpu_max_possible_capacity(cpu);
190
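		/*
		 * Each entry's hdemand is the demand that fully loads this CPU
		 * at that frequency once the per-frequency headroom from
		 * get_freq_max_load() is applied; power_cost() later compares
		 * task demand against these values.
		 */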
191 i = 0;
192 costs = per_cpu_info[cpu].ptable;
193 while (costs[i].freq) {
194 entry = &max_load->freqs[i];
195 freq = costs[i].freq;
196 hpct = get_freq_max_load(cpu, freq);
197 if (hpct <= 0 || hpct > 100)
198 hpct = 100;
199 hfreq = div64_u64((u64)freq * hpct, 100);
200 entry->hdemand =
201 div64_u64(max_demand * hfreq,
202 cpu_max_possible_freq(cpu));
203 i++;
204 }
205
206 rcu_assign_pointer(per_cpu(freq_max_load, cpu), max_load);
207 if (old_max_load)
208 kfree_rcu(old_max_load, rcu);
209 }
210
211 spin_unlock_irqrestore(&freq_max_load_lock, flags);
212 return 0;
213
214fail:
215 for_each_cpu(cpu, cpumask) {
216 max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
217 if (max_load) {
218 rcu_assign_pointer(per_cpu(freq_max_load, cpu), NULL);
219 kfree_rcu(max_load, rcu);
220 }
221 }
222
223 spin_unlock_irqrestore(&freq_max_load_lock, flags);
224 return ret;
225}
226
227unsigned int max_possible_efficiency = 1;
228unsigned int min_possible_efficiency = UINT_MAX;
229
230unsigned long __weak arch_get_cpu_efficiency(int cpu)
231{
232 return SCHED_CAPACITY_SCALE;
233}
234
235/* Keep track of max/min capacity possible across CPUs "currently" */
236static void __update_min_max_capacity(void)
237{
238 int i;
239 int max_cap = 0, min_cap = INT_MAX;
240
241 for_each_online_cpu(i) {
242 max_cap = max(max_cap, cpu_capacity(i));
243 min_cap = min(min_cap, cpu_capacity(i));
244 }
245
246 max_capacity = max_cap;
247 min_capacity = min_cap;
248}
249
250static void update_min_max_capacity(void)
251{
252 unsigned long flags;
253 int i;
254
255 local_irq_save(flags);
256 for_each_possible_cpu(i)
257 raw_spin_lock(&cpu_rq(i)->lock);
258
259 __update_min_max_capacity();
260
261 for_each_possible_cpu(i)
262 raw_spin_unlock(&cpu_rq(i)->lock);
263 local_irq_restore(flags);
264}
265
266/*
267 * Return 'capacity' of a cpu in reference to "least" efficient cpu, such that
268 * least efficient cpu gets capacity of 1024
269 */
270static unsigned long
271capacity_scale_cpu_efficiency(struct sched_cluster *cluster)
272{
273 return (1024 * cluster->efficiency) / min_possible_efficiency;
274}
275
276/*
277 * Return 'capacity' of a cpu in reference to cpu with lowest max_freq
278 * (min_max_freq), such that one with lowest max_freq gets capacity of 1024.
279 */
280static unsigned long capacity_scale_cpu_freq(struct sched_cluster *cluster)
281{
282 return (1024 * cluster_max_freq(cluster)) / min_max_freq;
283}
284
285/*
286 * Return load_scale_factor of a cpu in reference to "most" efficient cpu, so
287 * that "most" efficient cpu gets a load_scale_factor of 1
288 */
289static inline unsigned long
290load_scale_cpu_efficiency(struct sched_cluster *cluster)
291{
292 return DIV_ROUND_UP(1024 * max_possible_efficiency,
293 cluster->efficiency);
294}
295
296/*
297 * Return load_scale_factor of a cpu in reference to cpu with best max_freq
298 * (max_possible_freq), so that one with best max_freq gets a load_scale_factor
299 * of 1.
300 */
301static inline unsigned long load_scale_cpu_freq(struct sched_cluster *cluster)
302{
303 return DIV_ROUND_UP(1024 * max_possible_freq,
304 cluster_max_freq(cluster));
305}
306
307static int compute_capacity(struct sched_cluster *cluster)
308{
309 int capacity = 1024;
310
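	/*
	 * Both scale factors are 1024-based fixed point, so each multiply is
	 * followed by a >> 10 to return to the 1024 scale.
	 */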
311 capacity *= capacity_scale_cpu_efficiency(cluster);
312 capacity >>= 10;
313
314 capacity *= capacity_scale_cpu_freq(cluster);
315 capacity >>= 10;
316
317 return capacity;
318}
319
320static int compute_max_possible_capacity(struct sched_cluster *cluster)
321{
322 int capacity = 1024;
323
324 capacity *= capacity_scale_cpu_efficiency(cluster);
325 capacity >>= 10;
326
327 capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
328 capacity >>= 10;
329
330 return capacity;
331}
332
333static int compute_load_scale_factor(struct sched_cluster *cluster)
334{
335 int load_scale = 1024;
336
337 /*
338 * load_scale_factor accounts for the fact that task load
339 * is in reference to "best" performing cpu. Task's load will need to be
340 * scaled (up) by a factor to determine suitability to be placed on a
341 * (little) cpu.
342 */
343 load_scale *= load_scale_cpu_efficiency(cluster);
344 load_scale >>= 10;
345
346 load_scale *= load_scale_cpu_freq(cluster);
347 load_scale >>= 10;
348
349 return load_scale;
350}
351
352struct list_head cluster_head;
353static DEFINE_MUTEX(cluster_lock);
354static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
355DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
356struct sched_cluster *sched_cluster[NR_CPUS];
357int num_clusters;
358
359struct sched_cluster init_cluster = {
360 .list = LIST_HEAD_INIT(init_cluster.list),
361 .id = 0,
362 .max_power_cost = 1,
363 .min_power_cost = 1,
364 .capacity = 1024,
365 .max_possible_capacity = 1024,
366 .efficiency = 1,
367 .load_scale_factor = 1024,
368 .cur_freq = 1,
369 .max_freq = 1,
370 .max_mitigated_freq = UINT_MAX,
371 .min_freq = 1,
372 .max_possible_freq = 1,
373 .dstate = 0,
374 .dstate_wakeup_energy = 0,
375 .dstate_wakeup_latency = 0,
376 .exec_scale_factor = 1024,
377 .notifier_sent = 0,
378};
379
380static void update_all_clusters_stats(void)
381{
382 struct sched_cluster *cluster;
383 u64 highest_mpc = 0, lowest_mpc = U64_MAX;
384
385 pre_big_task_count_change(cpu_possible_mask);
386
387 for_each_sched_cluster(cluster) {
388 u64 mpc;
389
390 cluster->capacity = compute_capacity(cluster);
391 mpc = cluster->max_possible_capacity =
392 compute_max_possible_capacity(cluster);
393 cluster->load_scale_factor = compute_load_scale_factor(cluster);
394
395 cluster->exec_scale_factor =
396 DIV_ROUND_UP(cluster->efficiency * 1024,
397 max_possible_efficiency);
398
399 if (mpc > highest_mpc)
400 highest_mpc = mpc;
401
402 if (mpc < lowest_mpc)
403 lowest_mpc = mpc;
404 }
405
406 max_possible_capacity = highest_mpc;
407 min_max_possible_capacity = lowest_mpc;
408
409 __update_min_max_capacity();
410 sched_update_freq_max_load(cpu_possible_mask);
411 post_big_task_count_change(cpu_possible_mask);
412}
413
414static void assign_cluster_ids(struct list_head *head)
415{
416 struct sched_cluster *cluster;
417 int pos = 0;
418
419 list_for_each_entry(cluster, head, list) {
420 cluster->id = pos;
421 sched_cluster[pos++] = cluster;
422 }
423}
424
425static void
426move_list(struct list_head *dst, struct list_head *src, bool sync_rcu)
427{
428 struct list_head *first, *last;
429
430 first = src->next;
431 last = src->prev;
432
433 if (sync_rcu) {
434 INIT_LIST_HEAD_RCU(src);
435 synchronize_rcu();
436 }
437
438 first->prev = dst;
439 dst->prev = last;
440 last->next = dst;
441
442 /* Ensure list sanity before making the head visible to all CPUs. */
443 smp_mb();
444 dst->next = first;
445}
446
447static int
448compare_clusters(void *priv, struct list_head *a, struct list_head *b)
449{
450 struct sched_cluster *cluster1, *cluster2;
451 int ret;
452
453 cluster1 = container_of(a, struct sched_cluster, list);
454 cluster2 = container_of(b, struct sched_cluster, list);
455
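	/*
	 * Sort clusters by ascending max_power_cost; on a tie, the cluster
	 * with the higher max_possible_capacity sorts first.
	 */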
456 ret = cluster1->max_power_cost > cluster2->max_power_cost ||
457 (cluster1->max_power_cost == cluster2->max_power_cost &&
458 cluster1->max_possible_capacity <
459 cluster2->max_possible_capacity);
460
461 return ret;
462}
463
464static void sort_clusters(void)
465{
466 struct sched_cluster *cluster;
467 struct list_head new_head;
468
469 INIT_LIST_HEAD(&new_head);
470
471 for_each_sched_cluster(cluster) {
472 cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
473 max_task_load());
474 cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
475 0);
476 }
477
478 move_list(&new_head, &cluster_head, true);
479
480 list_sort(NULL, &new_head, compare_clusters);
481 assign_cluster_ids(&new_head);
482
483 /*
484 * Ensure cluster ids are visible to all CPUs before making
485 * cluster_head visible.
486 */
487 move_list(&cluster_head, &new_head, false);
488}
489
490static void
491insert_cluster(struct sched_cluster *cluster, struct list_head *head)
492{
493 struct sched_cluster *tmp;
494 struct list_head *iter = head;
495
496 list_for_each_entry(tmp, head, list) {
497 if (cluster->max_power_cost < tmp->max_power_cost)
498 break;
499 iter = &tmp->list;
500 }
501
502 list_add(&cluster->list, iter);
503}
504
505static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
506{
507 struct sched_cluster *cluster = NULL;
508
509 cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
510 if (!cluster) {
511 __WARN_printf("Cluster allocation failed. "
512 "Possible bad scheduling\n");
513 return NULL;
514 }
515
516 INIT_LIST_HEAD(&cluster->list);
517 cluster->max_power_cost = 1;
518 cluster->min_power_cost = 1;
519 cluster->capacity = 1024;
520 cluster->max_possible_capacity = 1024;
521 cluster->efficiency = 1;
522 cluster->load_scale_factor = 1024;
523 cluster->cur_freq = 1;
524 cluster->max_freq = 1;
525 cluster->max_mitigated_freq = UINT_MAX;
526 cluster->min_freq = 1;
527 cluster->max_possible_freq = 1;
528 cluster->dstate = 0;
529 cluster->dstate_wakeup_energy = 0;
530 cluster->dstate_wakeup_latency = 0;
531 cluster->freq_init_done = false;
532
533 cluster->cpus = *cpus;
534 cluster->efficiency = arch_get_cpu_efficiency(cpumask_first(cpus));
535
536 if (cluster->efficiency > max_possible_efficiency)
537 max_possible_efficiency = cluster->efficiency;
538 if (cluster->efficiency < min_possible_efficiency)
539 min_possible_efficiency = cluster->efficiency;
540
541 cluster->notifier_sent = 0;
542 return cluster;
543}
544
545static void add_cluster(const struct cpumask *cpus, struct list_head *head)
546{
547 struct sched_cluster *cluster = alloc_new_cluster(cpus);
548 int i;
549
550 if (!cluster)
551 return;
552
553 for_each_cpu(i, cpus)
554 cpu_rq(i)->cluster = cluster;
555
556 insert_cluster(cluster, head);
557 set_bit(num_clusters, all_cluster_ids);
558 num_clusters++;
559}
560
561void update_cluster_topology(void)
562{
563 struct cpumask cpus = *cpu_possible_mask;
564 const struct cpumask *cluster_cpus;
565 struct list_head new_head;
566 int i;
567
568 INIT_LIST_HEAD(&new_head);
569
570 for_each_cpu(i, &cpus) {
571 cluster_cpus = cpu_coregroup_mask(i);
572 cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
573 cpumask_andnot(&cpus, &cpus, cluster_cpus);
574 add_cluster(cluster_cpus, &new_head);
575 }
576
577 assign_cluster_ids(&new_head);
578
579 /*
580 * Ensure cluster ids are visible to all CPUs before making
581 * cluster_head visible.
582 */
583 move_list(&cluster_head, &new_head, false);
584}
585
586void init_clusters(void)
587{
588 bitmap_clear(all_cluster_ids, 0, NR_CPUS);
589 init_cluster.cpus = *cpu_possible_mask;
590 INIT_LIST_HEAD(&cluster_head);
591}
592
593int register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb)
594{
595 mutex_lock(&cluster_lock);
596 if (!cb->get_cpu_cycle_counter) {
597 mutex_unlock(&cluster_lock);
598 return -EINVAL;
599 }
600
601 cpu_cycle_counter_cb = *cb;
602 use_cycle_counter = true;
603 mutex_unlock(&cluster_lock);
604
605 return 0;
606}
607
608int got_boost_kick(void)
609{
610 int cpu = smp_processor_id();
611 struct rq *rq = cpu_rq(cpu);
612
613 return test_bit(BOOST_KICK, &rq->hmp_flags);
614}
615
616inline void clear_boost_kick(int cpu)
617{
618 struct rq *rq = cpu_rq(cpu);
619
620 clear_bit(BOOST_KICK, &rq->hmp_flags);
621}
622
623inline void boost_kick(int cpu)
624{
625 struct rq *rq = cpu_rq(cpu);
626
627 if (!test_and_set_bit(BOOST_KICK, &rq->hmp_flags))
628 smp_send_reschedule(cpu);
629}
630
631/* Clear any HMP scheduler related requests pending from or on cpu */
632void clear_hmp_request(int cpu)
633{
634 struct rq *rq = cpu_rq(cpu);
635 unsigned long flags;
636
637 clear_boost_kick(cpu);
638 clear_reserved(cpu);
639 if (rq->push_task) {
640 raw_spin_lock_irqsave(&rq->lock, flags);
641 if (rq->push_task) {
642 clear_reserved(rq->push_cpu);
643 put_task_struct(rq->push_task);
644 rq->push_task = NULL;
645 }
646 rq->active_balance = 0;
647 raw_spin_unlock_irqrestore(&rq->lock, flags);
648 }
649}
650
651int sched_set_static_cpu_pwr_cost(int cpu, unsigned int cost)
652{
653 struct rq *rq = cpu_rq(cpu);
654
655 rq->static_cpu_pwr_cost = cost;
656 return 0;
657}
658
659unsigned int sched_get_static_cpu_pwr_cost(int cpu)
660{
661 return cpu_rq(cpu)->static_cpu_pwr_cost;
662}
663
664int sched_set_static_cluster_pwr_cost(int cpu, unsigned int cost)
665{
666 struct sched_cluster *cluster = cpu_rq(cpu)->cluster;
667
668 cluster->static_cluster_pwr_cost = cost;
669 return 0;
670}
671
672unsigned int sched_get_static_cluster_pwr_cost(int cpu)
673{
674 return cpu_rq(cpu)->cluster->static_cluster_pwr_cost;
675}
676
677/*
678 * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
679 * associated with them. This is required for atomic update of those variables
680 * when they are modified via the sysctl interface.
681 *
682 * IMPORTANT: Initialize both copies to the same value!!
683 */
684
685/*
686 * Tasks that are runnable continuously for a period greater than
687 * EARLY_DETECTION_DURATION can be flagged early as potential
688 * high load tasks.
689 */
690#define EARLY_DETECTION_DURATION 9500000
691
692static __read_mostly unsigned int sched_ravg_hist_size = 5;
693__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;
694
695static __read_mostly unsigned int sched_window_stats_policy =
696 WINDOW_STATS_MAX_RECENT_AVG;
697__read_mostly unsigned int sysctl_sched_window_stats_policy =
698 WINDOW_STATS_MAX_RECENT_AVG;
699
700#define SCHED_ACCOUNT_WAIT_TIME 1
701
702__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);
703
704unsigned int __read_mostly sysctl_sched_enable_colocation = 1;
705
706/*
707 * Enable colocation and frequency aggregation for all threads in a process.
708 * Children inherit the group id from their parent.
709 */
710unsigned int __read_mostly sysctl_sched_enable_thread_grouping;
711
712
713__read_mostly unsigned int sysctl_sched_new_task_windows = 5;
714
715#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
716
717/*
718 * For increase, send notification if
719 * freq_required - cur_freq > sysctl_sched_freq_inc_notify
720 */
721__read_mostly int sysctl_sched_freq_inc_notify = 10 * 1024 * 1024; /* + 10GHz */
722
723/*
724 * For decrease, send notification if
725 * cur_freq - freq_required > sysctl_sched_freq_dec_notify
726 */
727__read_mostly int sysctl_sched_freq_dec_notify = 10 * 1024 * 1024; /* - 10GHz */
728
729static __read_mostly unsigned int sched_io_is_busy;
730
731__read_mostly unsigned int sysctl_sched_pred_alert_freq = 10 * 1024 * 1024;
732
733/*
734 * Maximum possible frequency across all cpus. Task demand and cpu
735 * capacity (cpu_power) metrics are scaled in reference to it.
736 */
737unsigned int max_possible_freq = 1;
738
739/*
740 * Minimum possible max_freq across all cpus. This will be the same as
741 * max_possible_freq on homogeneous systems and could be different from
742 * max_possible_freq on heterogeneous systems. min_max_freq is used to derive
743 * capacity (cpu_power) of cpus.
744 */
745unsigned int min_max_freq = 1;
746
747unsigned int max_capacity = 1024; /* max(rq->capacity) */
748unsigned int min_capacity = 1024; /* min(rq->capacity) */
749unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
750unsigned int
751min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */
752
753/* Window size (in ns) */
754__read_mostly unsigned int sched_ravg_window = 10000000;
755
756/* Min window size (in ns) = 10ms */
757#define MIN_SCHED_RAVG_WINDOW 10000000
758
759/* Max window size (in ns) = 1s */
760#define MAX_SCHED_RAVG_WINDOW 1000000000
761
762/* Temporarily disable window-stats activity on all cpus */
763unsigned int __read_mostly sched_disable_window_stats;
764
765/*
766 * Major task runtime. If a task runs for more than sched_major_task_runtime
767 * in a window, it's considered to be generating the majority of the workload
768 * for this window. Prediction could be adjusted for such tasks.
769 */
770__read_mostly unsigned int sched_major_task_runtime = 10000000;
771
772static unsigned int sync_cpu;
773
774static LIST_HEAD(related_thread_groups);
775static DEFINE_RWLOCK(related_thread_group_lock);
776
777#define for_each_related_thread_group(grp) \
778 list_for_each_entry(grp, &related_thread_groups, list)
779
780/*
781 * Demand aggregation for frequency purpose:
782 *
783 * 'sched_freq_aggregate' controls aggregation of cpu demand of related threads
784 * for frequency determination purpose. This aggregation is done per-cluster.
785 *
786 * CPU demand of tasks from various related groups is aggregated per-cluster and
787 * added to the "max_busy_cpu" in that cluster, where max_busy_cpu is determined
788 * by just rq->prev_runnable_sum.
789 *
790 * Some examples follow, which assume:
791 * Cluster0 = CPU0-3, Cluster1 = CPU4-7
792 * One related thread group A that has tasks A0, A1, A2
793 *
794 * A->cpu_time[X].curr/prev_sum = counters in which cpu execution stats of
795 * tasks belonging to group A are accumulated when they run on cpu X.
796 *
797 * CX->curr/prev_sum = counters in which cpu execution stats of all tasks
798 * not belonging to group A are accumulated when they run on cpu X
799 *
800 * Let's say the stats for window M were as below:
801 *
802 * C0->prev_sum = 1ms, A->cpu_time[0].prev_sum = 5ms
803 * Task A0 ran 5ms on CPU0
804 * Task B0 ran 1ms on CPU0
805 *
806 * C1->prev_sum = 5ms, A->cpu_time[1].prev_sum = 6ms
807 * Task A1 ran 4ms on CPU1
808 * Task A2 ran 2ms on CPU1
809 * Task B1 ran 5ms on CPU1
810 *
811 * C2->prev_sum = 0ms, A->cpu_time[2].prev_sum = 0
812 * CPU2 idle
813 *
814 * C3->prev_sum = 0ms, A->cpu_time[3].prev_sum = 0
815 * CPU3 idle
816 *
817 * In this case, CPU1 was most busy going by just its prev_sum counter. Demand
818 * from all group A tasks is added to CPU1. IOW, at the end of window M, cpu busy
819 * time reported to the governor will be:
820 *
821 *
822 * C0 busy time = 1ms
823 * C1 busy time = 5 + 5 + 6 = 16ms
824 *
825 */
826static __read_mostly unsigned int sched_freq_aggregate;
827__read_mostly unsigned int sysctl_sched_freq_aggregate;
828
829unsigned int __read_mostly sysctl_sched_freq_aggregate_threshold_pct;
830static unsigned int __read_mostly sched_freq_aggregate_threshold;
831
832/* Initial task load. Newly created tasks are assigned this load. */
833unsigned int __read_mostly sched_init_task_load_windows;
834unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;
835
836unsigned int max_task_load(void)
837{
838 return sched_ravg_window;
839}
840
841/*
842 * Scheduler boost is a mechanism to temporarily place tasks on CPUs
843 * with higher capacity than where they would normally have ended up
844 * given their load characteristics. Any entity enabling
845 * boost is responsible for disabling it as well.
846 */
847unsigned int sysctl_sched_boost;
848
849/* A cpu can no longer accommodate more tasks if:
850 *
851 * rq->nr_running > sysctl_sched_spill_nr_run ||
852 * rq->hmp_stats.cumulative_runnable_avg > sched_spill_load
853 */
854unsigned int __read_mostly sysctl_sched_spill_nr_run = 10;
855
856/*
857 * Place sync wakee tasks whose demand is less than the configured threshold
858 * on the waker's cluster.
859 */
860unsigned int __read_mostly sched_small_wakee_task_load;
861unsigned int __read_mostly sysctl_sched_small_wakee_task_load_pct = 10;
862
863unsigned int __read_mostly sched_big_waker_task_load;
864unsigned int __read_mostly sysctl_sched_big_waker_task_load_pct = 25;
865
866/*
867 * CPUs with load greater than the sched_spill_load_threshold are not
868 * eligible for task placement. When all CPUs in a cluster achieve a
869 * load higher than this level, tasks become eligible for inter-cluster
870 * migration.
871 */
872unsigned int __read_mostly sched_spill_load;
873unsigned int __read_mostly sysctl_sched_spill_load_pct = 100;
874
875/*
876 * Tasks whose bandwidth consumption on a cpu is more than
877 * sched_upmigrate are considered "big" tasks. Big tasks will be
878 * considered for "up" migration, i.e migrating to a cpu with better
879 * capacity.
880 */
881unsigned int __read_mostly sched_upmigrate;
882unsigned int __read_mostly sysctl_sched_upmigrate_pct = 80;
883
884/*
885 * Big tasks, once migrated, will need to drop their bandwidth
886 * consumption to less than sched_downmigrate before they are "down"
887 * migrated.
888 */
889unsigned int __read_mostly sched_downmigrate;
890unsigned int __read_mostly sysctl_sched_downmigrate_pct = 60;
891
892/*
893 * The load scale factor of a CPU gets boosted when its max frequency
894 * is restricted, which causes tasks to migrate to higher capacity
895 * CPUs early. The sched_upmigrate threshold is auto-upgraded by
896 * rq->max_possible_freq/rq->max_freq of a lower capacity CPU.
897 */
898unsigned int up_down_migrate_scale_factor = 1024;
899
900/*
901 * The scheduler selects and places a task on its previous CPU if its sleep
902 * time is less than sysctl_sched_select_prev_cpu_us.
903 */
904unsigned int __read_mostly
905sched_short_sleep_task_threshold = 2000 * NSEC_PER_USEC;
906
907unsigned int __read_mostly sysctl_sched_select_prev_cpu_us = 2000;
908
909unsigned int __read_mostly
910sched_long_cpu_selection_threshold = 100 * NSEC_PER_MSEC;
911
912unsigned int __read_mostly sysctl_sched_restrict_cluster_spill;
913
914void update_up_down_migrate(void)
915{
916 unsigned int up_migrate = pct_to_real(sysctl_sched_upmigrate_pct);
917 unsigned int down_migrate = pct_to_real(sysctl_sched_downmigrate_pct);
918 unsigned int delta;
919
920 if (up_down_migrate_scale_factor == 1024)
921 goto done;
922
923 delta = up_migrate - down_migrate;
924
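	/*
	 * Apply the 1024-based scale factor in microsecond units so the
	 * 32-bit intermediate product cannot overflow, then convert back
	 * to nanoseconds.
	 */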
925 up_migrate /= NSEC_PER_USEC;
926 up_migrate *= up_down_migrate_scale_factor;
927 up_migrate >>= 10;
928 up_migrate *= NSEC_PER_USEC;
929
930 up_migrate = min(up_migrate, sched_ravg_window);
931
932 down_migrate /= NSEC_PER_USEC;
933 down_migrate *= up_down_migrate_scale_factor;
934 down_migrate >>= 10;
935 down_migrate *= NSEC_PER_USEC;
936
937 down_migrate = min(down_migrate, up_migrate - delta);
938done:
939 sched_upmigrate = up_migrate;
940 sched_downmigrate = down_migrate;
941}
942
943void set_hmp_defaults(void)
944{
945 sched_spill_load =
946 pct_to_real(sysctl_sched_spill_load_pct);
947
948 update_up_down_migrate();
949
950 sched_major_task_runtime =
951 mult_frac(sched_ravg_window, MAJOR_TASK_PCT, 100);
952
953 sched_init_task_load_windows =
954 div64_u64((u64)sysctl_sched_init_task_load_pct *
955 (u64)sched_ravg_window, 100);
956
957 sched_short_sleep_task_threshold = sysctl_sched_select_prev_cpu_us *
958 NSEC_PER_USEC;
959
960 sched_small_wakee_task_load =
961 div64_u64((u64)sysctl_sched_small_wakee_task_load_pct *
962 (u64)sched_ravg_window, 100);
963
964 sched_big_waker_task_load =
965 div64_u64((u64)sysctl_sched_big_waker_task_load_pct *
966 (u64)sched_ravg_window, 100);
967
968 sched_freq_aggregate_threshold =
969 pct_to_real(sysctl_sched_freq_aggregate_threshold_pct);
970}
971
972u32 sched_get_init_task_load(struct task_struct *p)
973{
974 return p->init_load_pct;
975}
976
977int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
978{
979 if (init_load_pct < 0 || init_load_pct > 100)
980 return -EINVAL;
981
982 p->init_load_pct = init_load_pct;
983
984 return 0;
985}
986
987#ifdef CONFIG_CGROUP_SCHED
988
989int upmigrate_discouraged(struct task_struct *p)
990{
991 return task_group(p)->upmigrate_discouraged;
992}
993
994#else
995
996static inline int upmigrate_discouraged(struct task_struct *p)
997{
998 return 0;
999}
1000
1001#endif
1002
1003/* Is a task "big" on its current cpu */
1004static inline int __is_big_task(struct task_struct *p, u64 scaled_load)
1005{
1006 int nice = task_nice(p);
1007
1008 if (nice > SCHED_UPMIGRATE_MIN_NICE || upmigrate_discouraged(p))
1009 return 0;
1010
1011 return scaled_load > sched_upmigrate;
1012}
1013
1014int is_big_task(struct task_struct *p)
1015{
1016 return __is_big_task(p, scale_load_to_cpu(task_load(p), task_cpu(p)));
1017}
1018
1019u64 cpu_load(int cpu)
1020{
1021 struct rq *rq = cpu_rq(cpu);
1022
1023 return scale_load_to_cpu(rq->hmp_stats.cumulative_runnable_avg, cpu);
1024}
1025
1026u64 cpu_load_sync(int cpu, int sync)
1027{
1028 return scale_load_to_cpu(cpu_cravg_sync(cpu, sync), cpu);
1029}
1030
1031static int boost_refcount;
1032static DEFINE_SPINLOCK(boost_lock);
1033static DEFINE_MUTEX(boost_mutex);
1034
1035static void boost_kick_cpus(void)
1036{
1037 int i;
1038
1039 for_each_online_cpu(i) {
1040 if (cpu_capacity(i) != max_capacity)
1041 boost_kick(i);
1042 }
1043}
1044
1045int sched_boost(void)
1046{
1047 return boost_refcount > 0;
1048}
1049
1050int sched_set_boost(int enable)
1051{
1052 unsigned long flags;
1053 int ret = 0;
1054 int old_refcount;
1055
1056 spin_lock_irqsave(&boost_lock, flags);
1057
1058 old_refcount = boost_refcount;
1059
1060 if (enable == 1) {
1061 boost_refcount++;
1062 } else if (!enable) {
1063 if (boost_refcount >= 1)
1064 boost_refcount--;
1065 else
1066 ret = -EINVAL;
1067 } else {
1068 ret = -EINVAL;
1069 }
1070
1071 if (!old_refcount && boost_refcount)
1072 boost_kick_cpus();
1073
1074 trace_sched_set_boost(boost_refcount);
1075 spin_unlock_irqrestore(&boost_lock, flags);
1076
1077 return ret;
1078}
1079
1080int sched_boost_handler(struct ctl_table *table, int write,
1081 void __user *buffer, size_t *lenp,
1082 loff_t *ppos)
1083{
1084 int ret;
1085
1086 mutex_lock(&boost_mutex);
1087 if (!write)
1088 sysctl_sched_boost = sched_boost();
1089
1090 ret = proc_dointvec(table, write, buffer, lenp, ppos);
1091 if (ret || !write)
1092 goto done;
1093
1094 ret = (sysctl_sched_boost <= 1) ?
1095 sched_set_boost(sysctl_sched_boost) : -EINVAL;
1096
1097done:
1098 mutex_unlock(&boost_mutex);
1099 return ret;
1100}
1101
1102/*
1103 * A task will fit on a cpu if its bandwidth consumption on that cpu
1104 * will be less than sched_upmigrate. A big task that was previously
1105 * "up" migrated will be considered to fit on a "little" cpu if its
1106 * bandwidth consumption on the "little" cpu will be less than
1107 * sched_downmigrate. This helps avoid frequent migrations for
1108 * tasks with load close to the upmigrate threshold.
1109 */
1110int task_load_will_fit(struct task_struct *p, u64 task_load, int cpu,
1111 enum sched_boost_type boost_type)
1112{
1113 int upmigrate;
1114
1115 if (cpu_capacity(cpu) == max_capacity)
1116 return 1;
1117
1118 if (boost_type != SCHED_BOOST_ON_BIG) {
1119 if (task_nice(p) > SCHED_UPMIGRATE_MIN_NICE ||
1120 upmigrate_discouraged(p))
1121 return 1;
1122
1123 upmigrate = sched_upmigrate;
1124 if (cpu_capacity(task_cpu(p)) > cpu_capacity(cpu))
1125 upmigrate = sched_downmigrate;
1126
1127 if (task_load < upmigrate)
1128 return 1;
1129 }
1130
1131 return 0;
1132}
1133
1134enum sched_boost_type sched_boost_type(void)
1135{
1136 if (sched_boost()) {
1137 if (min_possible_efficiency != max_possible_efficiency)
1138 return SCHED_BOOST_ON_BIG;
1139 else
1140 return SCHED_BOOST_ON_ALL;
1141 }
1142 return SCHED_BOOST_NONE;
1143}
1144
1145int task_will_fit(struct task_struct *p, int cpu)
1146{
1147 u64 tload = scale_load_to_cpu(task_load(p), cpu);
1148
1149 return task_load_will_fit(p, tload, cpu, sched_boost_type());
1150}
1151
1152int group_will_fit(struct sched_cluster *cluster,
1153 struct related_thread_group *grp, u64 demand)
1154{
1155 int cpu = cluster_first_cpu(cluster);
1156 int prev_capacity = 0;
1157 unsigned int threshold = sched_upmigrate;
1158 u64 load;
1159
1160 if (cluster->capacity == max_capacity)
1161 return 1;
1162
1163 if (grp->preferred_cluster)
1164 prev_capacity = grp->preferred_cluster->capacity;
1165
1166 if (cluster->capacity < prev_capacity)
1167 threshold = sched_downmigrate;
1168
1169 load = scale_load_to_cpu(demand, cpu);
1170 if (load < threshold)
1171 return 1;
1172
1173 return 0;
1174}
1175
1176/*
1177 * Return the cost of running task p on CPU cpu. This function
1178 * currently assumes that task p is the only task which will run on
1179 * the CPU.
1180 */
1181unsigned int power_cost(int cpu, u64 demand)
1182{
1183 int first, mid, last;
1184 struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
1185 struct cpu_pstate_pwr *costs;
1186 struct freq_max_load *max_load;
1187 int total_static_pwr_cost = 0;
1188 struct rq *rq = cpu_rq(cpu);
1189 unsigned int pc;
1190
1191 if (!per_cpu_info || !per_cpu_info[cpu].ptable)
1192 /*
1193 * When power aware scheduling is not in use, or CPU
1194 * power data is not available, just use the CPU
1195 * capacity as a rough stand-in for real CPU power
1196 * numbers, assuming bigger CPUs are more power
1197 * hungry.
1198 */
1199 return cpu_max_possible_capacity(cpu);
1200
1201 rcu_read_lock();
1202 max_load = rcu_dereference(per_cpu(freq_max_load, cpu));
1203 if (!max_load) {
1204 pc = cpu_max_possible_capacity(cpu);
1205 goto unlock;
1206 }
1207
1208 costs = per_cpu_info[cpu].ptable;
1209
1210 if (demand <= max_load->freqs[0].hdemand) {
1211 pc = costs[0].power;
1212 goto unlock;
1213 } else if (demand > max_load->freqs[max_load->length - 1].hdemand) {
1214 pc = costs[max_load->length - 1].power;
1215 goto unlock;
1216 }
1217
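	/*
	 * Binary search for the lowest-indexed frequency whose hdemand can
	 * accommodate the requested demand; costs[] and max_load->freqs[]
	 * share the same index.
	 */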
1218 first = 0;
1219 last = max_load->length - 1;
1220 mid = (last - first) >> 1;
1221 while (1) {
1222 if (demand <= max_load->freqs[mid].hdemand)
1223 last = mid;
1224 else
1225 first = mid;
1226
1227 if (last - first == 1)
1228 break;
1229 mid = first + ((last - first) >> 1);
1230 }
1231
1232 pc = costs[last].power;
1233
1234unlock:
1235 rcu_read_unlock();
1236
1237 if (idle_cpu(cpu) && rq->cstate) {
1238 total_static_pwr_cost += rq->static_cpu_pwr_cost;
1239 if (rq->cluster->dstate)
1240 total_static_pwr_cost +=
1241 rq->cluster->static_cluster_pwr_cost;
1242 }
1243
1244 return pc + total_static_pwr_cost;
1245
1246}
1247
1248void inc_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
1249{
1250 if (sched_disable_window_stats)
1251 return;
1252
1253 if (is_big_task(p))
1254 stats->nr_big_tasks++;
1255}
1256
1257void dec_nr_big_task(struct hmp_sched_stats *stats, struct task_struct *p)
1258{
1259 if (sched_disable_window_stats)
1260 return;
1261
1262 if (is_big_task(p))
1263 stats->nr_big_tasks--;
1264
1265 BUG_ON(stats->nr_big_tasks < 0);
1266}
1267
1268void inc_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
1269{
1270 inc_nr_big_task(&rq->hmp_stats, p);
1271 if (change_cra)
1272 inc_cumulative_runnable_avg(&rq->hmp_stats, p);
1273}
1274
1275void dec_rq_hmp_stats(struct rq *rq, struct task_struct *p, int change_cra)
1276{
1277 dec_nr_big_task(&rq->hmp_stats, p);
1278 if (change_cra)
1279 dec_cumulative_runnable_avg(&rq->hmp_stats, p);
1280}
1281
1282static void reset_hmp_stats(struct hmp_sched_stats *stats, int reset_cra)
1283{
1284 stats->nr_big_tasks = 0;
1285 if (reset_cra) {
1286 stats->cumulative_runnable_avg = 0;
1287 stats->pred_demands_sum = 0;
1288 }
1289}
1290
1291/*
1292 * Invoked from three places:
1293 * 1) try_to_wake_up() -> ... -> select_best_cpu()
1294 * 2) scheduler_tick() -> ... -> migration_needed() -> select_best_cpu()
1295 * 3) can_migrate_task()
1296 *
1297 * It's safe to de-reference p->grp in the first case (since p->pi_lock is
1298 * held) but not in other cases. p->grp is hence freed after an RCU grace
1299 * period and accessed under rcu_read_lock().
1300 */
1301int preferred_cluster(struct sched_cluster *cluster, struct task_struct *p)
1302{
1303 struct related_thread_group *grp;
1304 int rc = 0;
1305
1306 rcu_read_lock();
1307
1308 grp = task_related_thread_group(p);
1309 if (!grp || !sysctl_sched_enable_colocation)
1310 rc = 1;
1311 else
1312 rc = (grp->preferred_cluster == cluster);
1313
1314 rcu_read_unlock();
1315 return rc;
1316}
1317
1318struct sched_cluster *rq_cluster(struct rq *rq)
1319{
1320 return rq->cluster;
1321}
1322
1323/*
1324 * reset_cpu_hmp_stats - reset HMP stats for a cpu
1325 * nr_big_tasks
1326 * cumulative_runnable_avg (iff reset_cra is true)
1327 */
1328void reset_cpu_hmp_stats(int cpu, int reset_cra)
1329{
1330 reset_cfs_rq_hmp_stats(cpu, reset_cra);
1331 reset_hmp_stats(&cpu_rq(cpu)->hmp_stats, reset_cra);
1332}
1333
1334void fixup_nr_big_tasks(struct hmp_sched_stats *stats,
1335 struct task_struct *p, s64 delta)
1336{
1337 u64 new_task_load;
1338 u64 old_task_load;
1339
1340 if (sched_disable_window_stats)
1341 return;
1342
1343 old_task_load = scale_load_to_cpu(task_load(p), task_cpu(p));
1344 new_task_load = scale_load_to_cpu(delta + task_load(p), task_cpu(p));
1345
1346 if (__is_big_task(p, old_task_load) && !__is_big_task(p, new_task_load))
1347 stats->nr_big_tasks--;
1348 else if (!__is_big_task(p, old_task_load) &&
1349 __is_big_task(p, new_task_load))
1350 stats->nr_big_tasks++;
1351
1352 BUG_ON(stats->nr_big_tasks < 0);
1353}
1354
1355/*
1356 * Walk runqueue of cpu and re-initialize 'nr_big_tasks' counters.
1357 */
1358static void update_nr_big_tasks(int cpu)
1359{
1360 struct rq *rq = cpu_rq(cpu);
1361 struct task_struct *p;
1362
1363 /* Do not reset cumulative_runnable_avg */
1364 reset_cpu_hmp_stats(cpu, 0);
1365
1366 list_for_each_entry(p, &rq->cfs_tasks, se.group_node)
1367 inc_hmp_sched_stats_fair(rq, p, 0);
1368}
1369
1370/* Disable interrupts and grab runqueue lock of all cpus listed in @cpus */
1371void pre_big_task_count_change(const struct cpumask *cpus)
1372{
1373 int i;
1374
1375 local_irq_disable();
1376
1377 for_each_cpu(i, cpus)
1378 raw_spin_lock(&cpu_rq(i)->lock);
1379}
1380
1381/*
1382 * Reinitialize 'nr_big_tasks' counters on all affected cpus
1383 */
1384void post_big_task_count_change(const struct cpumask *cpus)
1385{
1386 int i;
1387
1388 /* Assumes local_irq_disable() keeps online cpumap stable */
1389 for_each_cpu(i, cpus)
1390 update_nr_big_tasks(i);
1391
1392 for_each_cpu(i, cpus)
1393 raw_spin_unlock(&cpu_rq(i)->lock);
1394
1395 local_irq_enable();
1396}
1397
1398DEFINE_MUTEX(policy_mutex);
1399
1400static inline int invalid_value_freq_input(unsigned int *data)
1401{
1402 if (data == &sysctl_sched_freq_aggregate)
1403 return !(*data == 0 || *data == 1);
1404
1405 return 0;
1406}
1407
1408static inline int invalid_value(unsigned int *data)
1409{
1410 unsigned int val = *data;
1411
1412 if (data == &sysctl_sched_ravg_hist_size)
1413 return (val < 2 || val > RAVG_HIST_SIZE_MAX);
1414
1415 if (data == &sysctl_sched_window_stats_policy)
1416 return val >= WINDOW_STATS_INVALID_POLICY;
1417
1418 return invalid_value_freq_input(data);
1419}
1420
1421/*
1422 * Handle "atomic" update of sysctl_sched_window_stats_policy,
1423 * sysctl_sched_ravg_hist_size and sched_freq_legacy_mode variables.
1424 */
1425int sched_window_update_handler(struct ctl_table *table, int write,
1426 void __user *buffer, size_t *lenp,
1427 loff_t *ppos)
1428{
1429 int ret;
1430 unsigned int *data = (unsigned int *)table->data;
1431 unsigned int old_val;
1432
1433 mutex_lock(&policy_mutex);
1434
1435 old_val = *data;
1436
1437 ret = proc_dointvec(table, write, buffer, lenp, ppos);
1438 if (ret || !write || (write && (old_val == *data)))
1439 goto done;
1440
1441 if (invalid_value(data)) {
1442 *data = old_val;
1443 ret = -EINVAL;
1444 goto done;
1445 }
1446
1447 reset_all_window_stats(0, 0);
1448
1449done:
1450 mutex_unlock(&policy_mutex);
1451
1452 return ret;
1453}
1454
1455/*
1456 * Convert percentage value into absolute form. This avoids a div() operation
1457 * in the fast path when converting task load to percentage scale.
1458 */
1459int sched_hmp_proc_update_handler(struct ctl_table *table, int write,
1460 void __user *buffer, size_t *lenp,
1461 loff_t *ppos)
1462{
1463 int ret;
1464 unsigned int old_val;
1465 unsigned int *data = (unsigned int *)table->data;
1466 int update_min_nice = 0;
1467
1468 mutex_lock(&policy_mutex);
1469
1470 old_val = *data;
1471
1472 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
1473
1474 if (ret || !write)
1475 goto done;
1476
1477 if (write && (old_val == *data))
1478 goto done;
1479
1480 if (sysctl_sched_downmigrate_pct > sysctl_sched_upmigrate_pct) {
1481 *data = old_val;
1482 ret = -EINVAL;
1483 goto done;
1484 }
1485
1486 /*
1487 * Big task tunable change will need to re-classify tasks on
1488 * runqueue as big and set their counters appropriately.
1489 * The sysctl interface affects secondary variables (*_pct), which are then
1490 * "atomically" carried over to the primary variables. The atomic change
1491 * includes taking the runqueue lock of all online cpus and re-initializing
1492 * their big counter values based on changed criteria.
1493 */
1494 if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
1495 get_online_cpus();
1496 pre_big_task_count_change(cpu_online_mask);
1497 }
1498
1499 set_hmp_defaults();
1500
1501 if ((data == &sysctl_sched_upmigrate_pct || update_min_nice)) {
1502 post_big_task_count_change(cpu_online_mask);
1503 put_online_cpus();
1504 }
1505
1506done:
1507 mutex_unlock(&policy_mutex);
1508 return ret;
1509}
1510
1511inline int nr_big_tasks(struct rq *rq)
1512{
1513 return rq->hmp_stats.nr_big_tasks;
1514}
1515
1516unsigned int cpu_temp(int cpu)
1517{
1518 struct cpu_pwr_stats *per_cpu_info = get_cpu_pwr_stats();
1519
1520 if (per_cpu_info)
1521 return per_cpu_info[cpu].temp;
1522 else
1523 return 0;
1524}
1525
1526void init_new_task_load(struct task_struct *p)
1527{
1528 int i;
1529 u32 init_load_windows = sched_init_task_load_windows;
1530 u32 init_load_pct = current->init_load_pct;
1531
1532 p->init_load_pct = 0;
1533 rcu_assign_pointer(p->grp, NULL);
1534 INIT_LIST_HEAD(&p->grp_list);
1535 memset(&p->ravg, 0, sizeof(struct ravg));
1536 p->cpu_cycles = 0;
1537
1538 if (init_load_pct)
1539 init_load_windows = div64_u64((u64)init_load_pct *
1540 (u64)sched_ravg_window, 100);
1541
1542 p->ravg.demand = init_load_windows;
1543 p->ravg.pred_demand = 0;
1544 for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
1545 p->ravg.sum_history[i] = init_load_windows;
1546}
1547
1548/* Return task demand in percentage scale */
1549unsigned int pct_task_load(struct task_struct *p)
1550{
1551 unsigned int load;
1552
1553 load = div64_u64((u64)task_load(p) * 100, (u64)max_task_load());
1554
1555 return load;
1556}
1557
1558/*
1559 * Return total number of tasks "eligible" to run on highest capacity cpu
1560 *
1561 * This is simply nr_big_tasks for cpus which are not of max_capacity and
1562 * nr_running for cpus of max_capacity
1563 */
1564unsigned int nr_eligible_big_tasks(int cpu)
1565{
1566 struct rq *rq = cpu_rq(cpu);
1567 int nr_big = rq->hmp_stats.nr_big_tasks;
1568 int nr = rq->nr_running;
1569
1570 if (cpu_max_possible_capacity(cpu) != max_possible_capacity)
1571 return nr_big;
1572
1573 return nr;
1574}
1575
1576static inline int exiting_task(struct task_struct *p)
1577{
1578 return (p->ravg.sum_history[0] == EXITING_TASK_MARKER);
1579}
1580
1581static int __init set_sched_ravg_window(char *str)
1582{
1583 unsigned int window_size;
1584
1585 get_option(&str, &window_size);
1586
1587 if (window_size < MIN_SCHED_RAVG_WINDOW ||
1588 window_size > MAX_SCHED_RAVG_WINDOW) {
1589 WARN_ON(1);
1590 return -EINVAL;
1591 }
1592
1593 sched_ravg_window = window_size;
1594 return 0;
1595}
1596
1597early_param("sched_ravg_window", set_sched_ravg_window);
1598
1599static inline void
1600update_window_start(struct rq *rq, u64 wallclock)
1601{
1602 s64 delta;
1603 int nr_windows;
1604
1605 delta = wallclock - rq->window_start;
1606 BUG_ON(delta < 0);
1607 if (delta < sched_ravg_window)
1608 return;
1609
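	/* Advance window_start by whole windows so it stays window-aligned. */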
1610 nr_windows = div64_u64(delta, sched_ravg_window);
1611 rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
1612}
1613
1614#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + (Y - 1), Y)
1615
1616static inline u64 scale_exec_time(u64 delta, struct rq *rq)
1617{
1618 u32 freq;
1619
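	/*
	 * Normalize execution time to max_possible_freq on the most
	 * efficient cluster so task demand is comparable across CPUs.
	 */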
1620 freq = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
1621 delta = DIV64_U64_ROUNDUP(delta * freq, max_possible_freq);
1622 delta *= rq->cluster->exec_scale_factor;
1623 delta >>= 10;
1624
1625 return delta;
1626}
1627
1628static inline int cpu_is_waiting_on_io(struct rq *rq)
1629{
1630 if (!sched_io_is_busy)
1631 return 0;
1632
1633 return atomic_read(&rq->nr_iowait);
1634}
1635
1636/* Does freq_required sufficiently exceed or fall behind cur_freq? */
1637static inline int
1638nearly_same_freq(unsigned int cur_freq, unsigned int freq_required)
1639{
1640 int delta = freq_required - cur_freq;
1641
1642 if (freq_required > cur_freq)
1643 return delta < sysctl_sched_freq_inc_notify;
1644
1645 delta = -delta;
1646
1647 return delta < sysctl_sched_freq_dec_notify;
1648}
1649
1650/* Convert busy time to frequency equivalent */
1651static inline unsigned int load_to_freq(struct rq *rq, u64 load)
1652{
1653 unsigned int freq;
1654
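	/*
	 * Express the load as a 1/128 fraction of max_task_load() and map
	 * that fraction onto this CPU's maximum possible frequency.
	 */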
1655 load = scale_load_to_cpu(load, cpu_of(rq));
1656 load *= 128;
1657 load = div64_u64(load, max_task_load());
1658
1659 freq = load * cpu_max_possible_freq(cpu_of(rq));
1660 freq /= 128;
1661
1662 return freq;
1663}
1664
1665static inline struct group_cpu_time *
1666_group_cpu_time(struct related_thread_group *grp, int cpu);
1667
1668/*
1669 * Return load from all related groups on a given cpu.
1670 * Caller must ensure that related_thread_group_lock is held.
1671 */
1672static void _group_load_in_cpu(int cpu, u64 *grp_load, u64 *new_grp_load)
1673{
1674 struct related_thread_group *grp;
1675
1676 for_each_related_thread_group(grp) {
1677 struct group_cpu_time *cpu_time;
1678
1679 cpu_time = _group_cpu_time(grp, cpu);
1680 *grp_load += cpu_time->prev_runnable_sum;
1681 if (new_grp_load)
1682 *new_grp_load += cpu_time->nt_prev_runnable_sum;
1683 }
1684}
1685
1686/*
1687 * Return load from all related groups in a given frequency domain.
1688 * Caller must ensure that related_thread_group_lock is held.
1689 */
1690static void group_load_in_freq_domain(struct cpumask *cpus,
1691 u64 *grp_load, u64 *new_grp_load)
1692{
1693 struct related_thread_group *grp;
1694 int j;
1695
1696 for_each_related_thread_group(grp) {
1697 for_each_cpu(j, cpus) {
1698 struct group_cpu_time *cpu_time;
1699
1700 cpu_time = _group_cpu_time(grp, j);
1701 *grp_load += cpu_time->prev_runnable_sum;
1702 *new_grp_load += cpu_time->nt_prev_runnable_sum;
1703 }
1704 }
1705}
1706
1707/*
1708 * Should scheduler alert governor for changing frequency?
1709 *
1710 * @check_pred - evaluate frequency based on the predictive demand
1711 * @check_groups - add load from all related groups on given cpu
1712 *
1713 * check_groups is set to 1 if a "related" task movement/wakeup is triggering
1714 * the notification check. To avoid "re-aggregation" of demand in such cases,
1715 * we check whether the migrated/woken task's demand (along with demand from
1716 * existing tasks on the cpu) can be met on the target cpu.
1717 *
1718 */
1719
1720static int send_notification(struct rq *rq, int check_pred, int check_groups)
1721{
1722 unsigned int cur_freq, freq_required;
1723 unsigned long flags;
1724 int rc = 0;
1725 u64 group_load = 0, new_load = 0;
1726
1727 if (check_pred) {
1728 u64 prev = rq->old_busy_time;
1729 u64 predicted = rq->hmp_stats.pred_demands_sum;
1730
1731 if (rq->cluster->cur_freq == cpu_max_freq(cpu_of(rq)))
1732 return 0;
1733
1734 prev = max(prev, rq->old_estimated_time);
1735 if (prev > predicted)
1736 return 0;
1737
1738 cur_freq = load_to_freq(rq, prev);
1739 freq_required = load_to_freq(rq, predicted);
1740
1741 if (freq_required < cur_freq + sysctl_sched_pred_alert_freq)
1742 return 0;
1743 } else {
1744 read_lock(&related_thread_group_lock);
1745 /*
1746 * Protect from concurrent update of rq->prev_runnable_sum and
1747 * group cpu load
1748 */
1749 raw_spin_lock_irqsave(&rq->lock, flags);
1750 if (check_groups)
1751 _group_load_in_cpu(cpu_of(rq), &group_load, NULL);
1752
1753 new_load = rq->prev_runnable_sum + group_load;
1754
1755 raw_spin_unlock_irqrestore(&rq->lock, flags);
1756 read_unlock(&related_thread_group_lock);
1757
1758 cur_freq = load_to_freq(rq, rq->old_busy_time);
1759 freq_required = load_to_freq(rq, new_load);
1760
1761 if (nearly_same_freq(cur_freq, freq_required))
1762 return 0;
1763 }
1764
1765 raw_spin_lock_irqsave(&rq->lock, flags);
1766 if (!rq->cluster->notifier_sent) {
1767 rq->cluster->notifier_sent = 1;
1768 rc = 1;
1769 trace_sched_freq_alert(cpu_of(rq), check_pred, check_groups, rq,
1770 new_load);
1771 }
1772 raw_spin_unlock_irqrestore(&rq->lock, flags);
1773
1774 return rc;
1775}
1776
1777/* Alert governor if there is a need to change frequency */
1778void check_for_freq_change(struct rq *rq, bool check_pred, bool check_groups)
1779{
1780 int cpu = cpu_of(rq);
1781
1782 if (!send_notification(rq, check_pred, check_groups))
1783 return;
1784
1785 atomic_notifier_call_chain(
1786 &load_alert_notifier_head, 0,
1787 (void *)(long)cpu);
1788}
1789
1790void notify_migration(int src_cpu, int dest_cpu, bool src_cpu_dead,
1791 struct task_struct *p)
1792{
1793 bool check_groups;
1794
1795 rcu_read_lock();
1796 check_groups = task_in_related_thread_group(p);
1797 rcu_read_unlock();
1798
1799 if (!same_freq_domain(src_cpu, dest_cpu)) {
1800 if (!src_cpu_dead)
1801 check_for_freq_change(cpu_rq(src_cpu), false,
1802 check_groups);
1803 check_for_freq_change(cpu_rq(dest_cpu), false, check_groups);
1804 } else {
1805 check_for_freq_change(cpu_rq(dest_cpu), true, check_groups);
1806 }
1807}
1808
1809static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
1810 u64 irqtime, int event)
1811{
1812 if (is_idle_task(p)) {
1813 /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
1814 if (event == PICK_NEXT_TASK)
1815 return 0;
1816
1817 /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
1818 return irqtime || cpu_is_waiting_on_io(rq);
1819 }
1820
1821 if (event == TASK_WAKE)
1822 return 0;
1823
1824 if (event == PUT_PREV_TASK || event == IRQ_UPDATE)
1825 return 1;
1826
1827 /*
1828 * TASK_UPDATE can be called on a sleeping task, when it is moved between
1829 * related groups.
1830 */
1831 if (event == TASK_UPDATE) {
1832 if (rq->curr == p)
1833 return 1;
1834
1835 return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;
1836 }
1837
1838 /* TASK_MIGRATE, PICK_NEXT_TASK left */
1839 return SCHED_FREQ_ACCOUNT_WAIT_TIME;
1840}
1841
1842static inline bool is_new_task(struct task_struct *p)
1843{
1844 return p->ravg.active_windows < sysctl_sched_new_task_windows;
1845}
1846
1847#define INC_STEP 8
1848#define DEC_STEP 2
1849#define CONSISTENT_THRES 16
1850#define INC_STEP_BIG 16
1851/*
1852 * bucket_increase - update the count of all buckets
1853 *
1854 * @buckets: array of buckets tracking busy time of a task
1855 * @idx: the index of bucket to be incremented
1856 *
1857 * Each time a complete window finishes, count of bucket that runtime
1858 * falls in (@idx) is incremented. Counts of all other buckets are
1859 * decayed. The rate of increase and decay could be different based
1860 * on current count in the bucket.
1861 */
1862static inline void bucket_increase(u8 *buckets, int idx)
1863{
1864 int i, step;
1865
1866 for (i = 0; i < NUM_BUSY_BUCKETS; i++) {
1867 if (idx != i) {
1868 if (buckets[i] > DEC_STEP)
1869 buckets[i] -= DEC_STEP;
1870 else
1871 buckets[i] = 0;
1872 } else {
1873 step = buckets[i] >= CONSISTENT_THRES ?
1874 INC_STEP_BIG : INC_STEP;
1875 if (buckets[i] > U8_MAX - step)
1876 buckets[i] = U8_MAX;
1877 else
1878 buckets[i] += step;
1879 }
1880 }
1881}
1882
1883static inline int busy_to_bucket(u32 normalized_rt)
1884{
1885 int bidx;
1886
1887 bidx = mult_frac(normalized_rt, NUM_BUSY_BUCKETS, max_task_load());
1888 bidx = min(bidx, NUM_BUSY_BUCKETS - 1);
1889
1890 /*
1891 * Combine the lowest two buckets. The lowest frequency already falls
1892 * into the 2nd bucket, so continually predicting the lowest bucket is
1893 * not useful.
1894 */
1895 if (!bidx)
1896 bidx++;
1897
1898 return bidx;
1899}
1900
1901static inline u64
1902scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq)
1903{
1904 return div64_u64(load * (u64)src_freq, (u64)dst_freq);
1905}
1906
1907#define HEAVY_TASK_SKIP 2
1908#define HEAVY_TASK_SKIP_LIMIT 4
1909/*
1910 * get_pred_busy - calculate predicted demand for a task on runqueue
1911 *
1912 * @rq: runqueue of task p
1913 * @p: task whose prediction is being updated
1914 * @start: starting bucket. returned prediction should not be lower than
1915 * this bucket.
1916 * @runtime: runtime of the task. returned prediction should not be lower
1917 * than this runtime.
1918 * Note: @start can be derived from @runtime. It's passed in only to
1919 * avoid duplicated calculation in some cases.
1920 *
1921 * A new predicted busy time is returned for task @p based on @runtime
1922 * passed in. The function searches through buckets that represent busy
1923 * time equal to or bigger than @runtime and attempts to find the bucket
1924 * to use for prediction. Once found, it searches through historical busy
1925 * time and returns the latest that falls into the bucket. If no such busy
1926 * time exists, it returns the midpoint of that bucket.
1927 */
1928static u32 get_pred_busy(struct rq *rq, struct task_struct *p,
1929 int start, u32 runtime)
1930{
1931 int i;
1932 u8 *buckets = p->ravg.busy_buckets;
1933 u32 *hist = p->ravg.sum_history;
1934 u32 dmin, dmax;
1935 u64 cur_freq_runtime = 0;
1936 int first = NUM_BUSY_BUCKETS, final, skip_to;
1937 u32 ret = runtime;
1938
1939 /* skip prediction for new tasks due to lack of history */
1940 if (unlikely(is_new_task(p)))
1941 goto out;
1942
1943 /* find minimal bucket index to pick */
1944 for (i = start; i < NUM_BUSY_BUCKETS; i++) {
1945 if (buckets[i]) {
1946 first = i;
1947 break;
1948 }
1949 }
1950 /* if no higher buckets are filled, predict runtime */
1951 if (first >= NUM_BUSY_BUCKETS)
1952 goto out;
1953
1954 /* compute the bucket for prediction */
1955 final = first;
1956 if (first < HEAVY_TASK_SKIP_LIMIT) {
1957 /* compute runtime at current CPU frequency */
1958 cur_freq_runtime = mult_frac(runtime, max_possible_efficiency,
1959 rq->cluster->efficiency);
1960 cur_freq_runtime = scale_load_to_freq(cur_freq_runtime,
1961 max_possible_freq, rq->cluster->cur_freq);
1962 /*
1963 * if the task runs for majority of the window, try to
1964 * pick higher buckets.
1965 */
1966 if (cur_freq_runtime >= sched_major_task_runtime) {
1967 int next = NUM_BUSY_BUCKETS;
1968 /*
1969 * if there is a higher bucket that's consistently
1970 * hit, don't jump beyond that.
1971 */
1972 for (i = start + 1; i <= HEAVY_TASK_SKIP_LIMIT &&
1973 i < NUM_BUSY_BUCKETS; i++) {
1974 if (buckets[i] > CONSISTENT_THRES) {
1975 next = i;
1976 break;
1977 }
1978 }
1979 skip_to = min(next, start + HEAVY_TASK_SKIP);
1980 /* don't jump beyond HEAVY_TASK_SKIP_LIMIT */
1981 skip_to = min(HEAVY_TASK_SKIP_LIMIT, skip_to);
1982 /* don't go below first non-empty bucket, if any */
1983 final = max(first, skip_to);
1984 }
1985 }
1986
1987 /* determine demand range for the predicted bucket */
1988 if (final < 2) {
1989 /* lowest two buckets are combined */
1990 dmin = 0;
1991 final = 1;
1992 } else {
1993 dmin = mult_frac(final, max_task_load(), NUM_BUSY_BUCKETS);
1994 }
1995 dmax = mult_frac(final + 1, max_task_load(), NUM_BUSY_BUCKETS);
1996
1997 /*
1998 * search through runtime history and return first runtime that falls
1999 * into the range of predicted bucket.
2000 */
2001 for (i = 0; i < sched_ravg_hist_size; i++) {
2002 if (hist[i] >= dmin && hist[i] < dmax) {
2003 ret = hist[i];
2004 break;
2005 }
2006 }
2007 /* no historical runtime within the bucket found, use the bucket midpoint */
2008 if (ret < dmin)
2009 ret = (dmin + dmax) / 2;
2010 /*
2011 * when updating in the middle of a window, runtime could be higher
2012 * than all recorded history. Always predict at least @runtime.
2013 */
2014 ret = max(runtime, ret);
2015out:
2016 trace_sched_update_pred_demand(rq, p, runtime,
2017 mult_frac((unsigned int)cur_freq_runtime, 100,
2018 sched_ravg_window), ret);
2019 return ret;
2020}
2021
2022static inline u32 calc_pred_demand(struct rq *rq, struct task_struct *p)
2023{
2024 if (p->ravg.pred_demand >= p->ravg.curr_window)
2025 return p->ravg.pred_demand;
2026
2027 return get_pred_busy(rq, p, busy_to_bucket(p->ravg.curr_window),
2028 p->ravg.curr_window);
2029}
2030
2031/*
2032 * Predictive demand of a task is calculated at the window roll-over.
2033 * If the task's current window busy time exceeds the predicted
2034 * demand, update it here to reflect the task's needs.
2035 */
2036void update_task_pred_demand(struct rq *rq, struct task_struct *p, int event)
2037{
2038 u32 new, old;
2039
2040 if (is_idle_task(p) || exiting_task(p))
2041 return;
2042
2043 if (event != PUT_PREV_TASK && event != TASK_UPDATE &&
2044 (!SCHED_FREQ_ACCOUNT_WAIT_TIME ||
2045 (event != TASK_MIGRATE &&
2046 event != PICK_NEXT_TASK)))
2047 return;
2048
2049 /*
2050	 * TASK_UPDATE can be called on a sleeping task, when it is moved
2051	 * between related groups.
2052 */
2053 if (event == TASK_UPDATE) {
2054 if (!p->on_rq && !SCHED_FREQ_ACCOUNT_WAIT_TIME)
2055 return;
2056 }
2057
2058 new = calc_pred_demand(rq, p);
2059 old = p->ravg.pred_demand;
2060
2061 if (old >= new)
2062 return;
2063
2064 if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
2065 !p->dl.dl_throttled))
2066 p->sched_class->fixup_hmp_sched_stats(rq, p,
2067 p->ravg.demand,
2068 new);
2069
2070 p->ravg.pred_demand = new;
2071}
2072
2073/*
2074 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
2075 */
2076static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
2077 int event, u64 wallclock, u64 irqtime)
2078{
2079 int new_window, full_window = 0;
2080 int p_is_curr_task = (p == rq->curr);
2081 u64 mark_start = p->ravg.mark_start;
2082 u64 window_start = rq->window_start;
2083 u32 window_size = sched_ravg_window;
2084 u64 delta;
2085 u64 *curr_runnable_sum = &rq->curr_runnable_sum;
2086 u64 *prev_runnable_sum = &rq->prev_runnable_sum;
2087 u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
2088 u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
2089 int flip_counters = 0;
2090 int prev_sum_reset = 0;
2091 bool new_task;
2092 struct related_thread_group *grp;
2093
2094 new_window = mark_start < window_start;
2095 if (new_window) {
2096 full_window = (window_start - mark_start) >= window_size;
2097 if (p->ravg.active_windows < USHRT_MAX)
2098 p->ravg.active_windows++;
2099 }
2100
2101 new_task = is_new_task(p);
2102
2103 grp = p->grp;
2104 if (grp && sched_freq_aggregate) {
2105 /* cpu_time protected by rq_lock */
2106 struct group_cpu_time *cpu_time =
2107 _group_cpu_time(grp, cpu_of(rq));
2108
2109 curr_runnable_sum = &cpu_time->curr_runnable_sum;
2110 prev_runnable_sum = &cpu_time->prev_runnable_sum;
2111
2112 nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
2113 nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
2114
2115 if (cpu_time->window_start != rq->window_start) {
2116 int nr_windows;
2117
2118 delta = rq->window_start - cpu_time->window_start;
2119 nr_windows = div64_u64(delta, window_size);
2120 if (nr_windows > 1)
2121 prev_sum_reset = 1;
2122
2123 cpu_time->window_start = rq->window_start;
2124 flip_counters = 1;
2125 }
2126
2127 if (p_is_curr_task && new_window) {
2128 u64 curr_sum = rq->curr_runnable_sum;
2129 u64 nt_curr_sum = rq->nt_curr_runnable_sum;
2130
2131 if (full_window)
2132 curr_sum = nt_curr_sum = 0;
2133
2134 rq->prev_runnable_sum = curr_sum;
2135 rq->nt_prev_runnable_sum = nt_curr_sum;
2136
2137 rq->curr_runnable_sum = 0;
2138 rq->nt_curr_runnable_sum = 0;
2139 }
2140 } else {
2141 if (p_is_curr_task && new_window) {
2142 flip_counters = 1;
2143 if (full_window)
2144 prev_sum_reset = 1;
2145 }
2146 }
2147
2148 /*
2149 * Handle per-task window rollover. We don't care about the idle
2150 * task or exiting tasks.
2151 */
2152 if (new_window && !is_idle_task(p) && !exiting_task(p)) {
2153 u32 curr_window = 0;
2154
2155 if (!full_window)
2156 curr_window = p->ravg.curr_window;
2157
2158 p->ravg.prev_window = curr_window;
2159 p->ravg.curr_window = 0;
2160 }
2161
2162 if (flip_counters) {
2163 u64 curr_sum = *curr_runnable_sum;
2164 u64 nt_curr_sum = *nt_curr_runnable_sum;
2165
2166 if (prev_sum_reset)
2167 curr_sum = nt_curr_sum = 0;
2168
2169 *prev_runnable_sum = curr_sum;
2170 *nt_prev_runnable_sum = nt_curr_sum;
2171
2172 *curr_runnable_sum = 0;
2173 *nt_curr_runnable_sum = 0;
2174 }
2175
2176 if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
2177 /*
2178 * account_busy_for_cpu_time() = 0, so no update to the
2179 * task's current window needs to be made. This could be
2180 * for example
2181 *
2182 * - a wakeup event on a task within the current
2183 * window (!new_window below, no action required),
2184 * - switching to a new task from idle (PICK_NEXT_TASK)
2185 * in a new window where irqtime is 0 and we aren't
2186 * waiting on IO
2187 */
2188
2189 if (!new_window)
2190 return;
2191
2192 /*
2193 * A new window has started. The RQ demand must be rolled
2194 * over if p is the current task.
2195 */
2196 if (p_is_curr_task) {
2197 /* p is idle task */
2198 BUG_ON(p != rq->idle);
2199 }
2200
2201 return;
2202 }
2203
2204 if (!new_window) {
2205 /*
2206 * account_busy_for_cpu_time() = 1 so busy time needs
2207 * to be accounted to the current window. No rollover
2208 * since we didn't start a new window. An example of this is
2209 * when a task starts execution and then sleeps within the
2210 * same window.
2211 */
2212
2213 if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
2214 delta = wallclock - mark_start;
2215 else
2216 delta = irqtime;
2217 delta = scale_exec_time(delta, rq);
2218 *curr_runnable_sum += delta;
2219 if (new_task)
2220 *nt_curr_runnable_sum += delta;
2221
2222 if (!is_idle_task(p) && !exiting_task(p))
2223 p->ravg.curr_window += delta;
2224
2225 return;
2226 }
2227
2228 if (!p_is_curr_task) {
2229 /*
2230 * account_busy_for_cpu_time() = 1 so busy time needs
2231 * to be accounted to the current window. A new window
2232 * has also started, but p is not the current task, so the
2233 * window is not rolled over - just split up and account
2234 * as necessary into curr and prev. The window is only
2235 * rolled over when a new window is processed for the current
2236 * task.
2237 *
2238 * Irqtime can't be accounted by a task that isn't the
2239 * currently running task.
2240 */
2241
2242 if (!full_window) {
2243 /*
2244 * A full window hasn't elapsed, account partial
2245 * contribution to previous completed window.
2246 */
2247 delta = scale_exec_time(window_start - mark_start, rq);
2248 if (!exiting_task(p))
2249 p->ravg.prev_window += delta;
2250 } else {
2251 /*
2252 * Since at least one full window has elapsed,
2253 * the contribution to the previous window is the
2254 * full window (window_size).
2255 */
2256 delta = scale_exec_time(window_size, rq);
2257 if (!exiting_task(p))
2258 p->ravg.prev_window = delta;
2259 }
2260
2261 *prev_runnable_sum += delta;
2262 if (new_task)
2263 *nt_prev_runnable_sum += delta;
2264
2265 /* Account piece of busy time in the current window. */
2266 delta = scale_exec_time(wallclock - window_start, rq);
2267 *curr_runnable_sum += delta;
2268 if (new_task)
2269 *nt_curr_runnable_sum += delta;
2270
2271 if (!exiting_task(p))
2272 p->ravg.curr_window = delta;
2273
2274 return;
2275 }
2276
2277 if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
2278 /*
2279 * account_busy_for_cpu_time() = 1 so busy time needs
2280 * to be accounted to the current window. A new window
2281 * has started and p is the current task so rollover is
2282 * needed. If any of these three above conditions are true
2283 * then this busy time can't be accounted as irqtime.
2284 *
2285 * Busy time for the idle task or exiting tasks need not
2286 * be accounted.
2287 *
2288 * An example of this would be a task that starts execution
2289 * and then sleeps once a new window has begun.
2290 */
2291
2292 if (!full_window) {
2293 /*
2294 * A full window hasn't elapsed, account partial
2295 * contribution to previous completed window.
2296 */
2297 delta = scale_exec_time(window_start - mark_start, rq);
2298 if (!is_idle_task(p) && !exiting_task(p))
2299 p->ravg.prev_window += delta;
2300 } else {
2301 /*
2302 * Since at least one full window has elapsed,
2303 * the contribution to the previous window is the
2304 * full window (window_size).
2305 */
2306 delta = scale_exec_time(window_size, rq);
2307 if (!is_idle_task(p) && !exiting_task(p))
2308 p->ravg.prev_window = delta;
2309 }
2310
2311 /*
2312 * Rollover is done here by overwriting the values in
2313 * prev_runnable_sum and curr_runnable_sum.
2314 */
2315 *prev_runnable_sum += delta;
2316 if (new_task)
2317 *nt_prev_runnable_sum += delta;
2318
2319 /* Account piece of busy time in the current window. */
2320 delta = scale_exec_time(wallclock - window_start, rq);
2321 *curr_runnable_sum += delta;
2322 if (new_task)
2323 *nt_curr_runnable_sum += delta;
2324
2325 if (!is_idle_task(p) && !exiting_task(p))
2326 p->ravg.curr_window = delta;
2327
2328 return;
2329 }
2330
2331 if (irqtime) {
2332 /*
2333 * account_busy_for_cpu_time() = 1 so busy time needs
2334 * to be accounted to the current window. A new window
2335 * has started and p is the current task so rollover is
2336 * needed. The current task must be the idle task because
2337 * irqtime is not accounted for any other task.
2338 *
2339 * Irqtime will be accounted each time we process IRQ activity
2340 * after a period of idleness, so we know the IRQ busy time
2341 * started at wallclock - irqtime.
2342 */
2343
2344 BUG_ON(!is_idle_task(p));
2345 mark_start = wallclock - irqtime;
2346
2347 /*
2348 * Roll window over. If IRQ busy time was just in the current
2349 * window then that is all that need be accounted.
2350 */
2351 if (mark_start > window_start) {
2352 *curr_runnable_sum = scale_exec_time(irqtime, rq);
2353 return;
2354 }
2355
2356 /*
2357 * The IRQ busy time spanned multiple windows. Process the
2358 * busy time preceding the current window start first.
2359 */
2360 delta = window_start - mark_start;
2361 if (delta > window_size)
2362 delta = window_size;
2363 delta = scale_exec_time(delta, rq);
2364 *prev_runnable_sum += delta;
2365
2366 /* Process the remaining IRQ busy time in the current window. */
2367 delta = wallclock - window_start;
2368 rq->curr_runnable_sum = scale_exec_time(delta, rq);
2369
2370 return;
2371 }
2372
2373 BUG();
2374}
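/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * curr/prev counter flip performed on a window rollover in
 * update_cpu_busy_time(). When more than one full window has elapsed
 * (prev_sum_reset), the stale busy time is discarded instead of being
 * carried into the previous-window counter.
 */
#if 0
#include <stdint.h>

struct busy_sums {
	uint64_t curr_runnable_sum;
	uint64_t prev_runnable_sum;
};

static void flip_counters(struct busy_sums *s, int prev_sum_reset)
{
	uint64_t curr = s->curr_runnable_sum;

	if (prev_sum_reset)
		curr = 0;

	s->prev_runnable_sum = curr;	/* completed window becomes "prev" */
	s->curr_runnable_sum = 0;	/* new window starts empty */
}
#endif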
2375
2376static inline u32 predict_and_update_buckets(struct rq *rq,
2377 struct task_struct *p, u32 runtime) {
2378
2379 int bidx;
2380 u32 pred_demand;
2381
2382 bidx = busy_to_bucket(runtime);
2383 pred_demand = get_pred_busy(rq, p, bidx, runtime);
2384 bucket_increase(p->ravg.busy_buckets, bidx);
2385
2386 return pred_demand;
2387}
2388
2389static void update_task_cpu_cycles(struct task_struct *p, int cpu)
2390{
2391 if (use_cycle_counter)
2392 p->cpu_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
2393}
2394
2395static void
2396update_task_rq_cpu_cycles(struct task_struct *p, struct rq *rq, int event,
2397 u64 wallclock, u64 irqtime)
2398{
2399 u64 cur_cycles;
2400 int cpu = cpu_of(rq);
2401
2402 lockdep_assert_held(&rq->lock);
2403
2404 if (!use_cycle_counter) {
2405 rq->cc.cycles = cpu_cur_freq(cpu);
2406 rq->cc.time = 1;
2407 return;
2408 }
2409
2410 cur_cycles = cpu_cycle_counter_cb.get_cpu_cycle_counter(cpu);
2411
2412 /*
2413	 * If the current task is the idle task and irqtime == 0, the CPU
2414	 * was indeed idle and its cycle counter was probably not
2415	 * increasing. We still need an estimated CPU frequency
2416	 * for IO wait time accounting. Use the previously
2417	 * calculated frequency in such a case.
2418 */
2419 if (!is_idle_task(rq->curr) || irqtime) {
2420 if (unlikely(cur_cycles < p->cpu_cycles))
2421 rq->cc.cycles = cur_cycles + (U64_MAX - p->cpu_cycles);
2422 else
2423 rq->cc.cycles = cur_cycles - p->cpu_cycles;
2424 rq->cc.cycles = rq->cc.cycles * NSEC_PER_MSEC;
2425
2426 if (event == IRQ_UPDATE && is_idle_task(p))
2427 /*
2428 * Time between mark_start of idle task and IRQ handler
2429 * entry time is CPU cycle counter stall period.
2430 * Upon IRQ handler entry sched_account_irqstart()
2431 * replenishes idle task's cpu cycle counter so
2432 * rq->cc.cycles now represents increased cycles during
2433 * IRQ handler rather than time between idle entry and
2434 * IRQ exit. Thus use irqtime as time delta.
2435 */
2436 rq->cc.time = irqtime;
2437 else
2438 rq->cc.time = wallclock - p->ravg.mark_start;
2439 BUG_ON((s64)rq->cc.time < 0);
2440 }
2441
2442 p->cpu_cycles = cur_cycles;
2443
2444 trace_sched_get_task_cpu_cycles(cpu, event, rq->cc.cycles, rq->cc.time);
2445}
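/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * frequency estimate implied by the rq->cc fields set above. The cycle
 * delta is pre-scaled by NSEC_PER_MSEC, so dividing by the elapsed time
 * in nanoseconds yields cycles per millisecond, i.e. a value in kHz.
 * The actual consumer, cpu_cycles_to_freq(), is defined elsewhere; this
 * only demonstrates the arithmetic as an assumption about its intent.
 */
#if 0
#include <stdint.h>

#define NSEC_PER_MSEC	1000000ULL

static uint64_t est_freq_khz(uint64_t cycle_delta, uint64_t time_delta_ns)
{
	/* e.g. 1500000 cycles over 1000000 ns -> 1500000 kHz (1.5 GHz) */
	return cycle_delta * NSEC_PER_MSEC / time_delta_ns;
}
#endif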
2446
2447static int account_busy_for_task_demand(struct task_struct *p, int event)
2448{
2449 /*
2450 * No need to bother updating task demand for exiting tasks
2451 * or the idle task.
2452 */
2453 if (exiting_task(p) || is_idle_task(p))
2454 return 0;
2455
2456 /*
2457 * When a task is waking up it is completing a segment of non-busy
2458 * time. Likewise, if wait time is not treated as busy time, then
2459 * when a task begins to run or is migrated, it is not running and
2460 * is completing a segment of non-busy time.
2461 */
2462 if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
2463 (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
2464 return 0;
2465
2466 return 1;
2467}
2468
2469/*
2470 * Called when new window is starting for a task, to record cpu usage over
2471 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
2472 * when, say, a real-time task runs without preemption for several windows at a
2473 * stretch.
2474 */
2475static void update_history(struct rq *rq, struct task_struct *p,
2476 u32 runtime, int samples, int event)
2477{
2478 u32 *hist = &p->ravg.sum_history[0];
2479 int ridx, widx;
2480 u32 max = 0, avg, demand, pred_demand;
2481 u64 sum = 0;
2482
2483 /* Ignore windows where task had no activity */
2484 if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
2485 goto done;
2486
2487 /* Push new 'runtime' value onto stack */
2488 widx = sched_ravg_hist_size - 1;
2489 ridx = widx - samples;
2490 for (; ridx >= 0; --widx, --ridx) {
2491 hist[widx] = hist[ridx];
2492 sum += hist[widx];
2493 if (hist[widx] > max)
2494 max = hist[widx];
2495 }
2496
2497 for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
2498 hist[widx] = runtime;
2499 sum += hist[widx];
2500 if (hist[widx] > max)
2501 max = hist[widx];
2502 }
2503
2504 p->ravg.sum = 0;
2505
2506 if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
2507 demand = runtime;
2508 } else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
2509 demand = max;
2510 } else {
2511 avg = div64_u64(sum, sched_ravg_hist_size);
2512 if (sched_window_stats_policy == WINDOW_STATS_AVG)
2513 demand = avg;
2514 else
2515 demand = max(avg, runtime);
2516 }
2517 pred_demand = predict_and_update_buckets(rq, p, runtime);
2518
2519 /*
2520 * A throttled deadline sched class task gets dequeued without
2521	 * changing p->on_rq. Since the dequeue decrements hmp stats,
2522	 * avoid decrementing them here again.
2523 */
2524 if (task_on_rq_queued(p) && (!task_has_dl_policy(p) ||
2525 !p->dl.dl_throttled))
2526 p->sched_class->fixup_hmp_sched_stats(rq, p, demand,
2527 pred_demand);
2528
2529 p->ravg.demand = demand;
2530 p->ravg.pred_demand = pred_demand;
2531
2532done:
2533 trace_sched_update_history(rq, p, runtime, samples, event);
2534}
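/*
 * Illustrative sketch (standalone, not compiled in this file): how the
 * window-stats policy in update_history() turns a demand history into a
 * single demand value. The history is assumed to already contain the
 * newest window's busy time, which is also passed as 'runtime'; the
 * policy names mirror the WINDOW_STATS_* checks above, and HIST_SIZE is
 * a stand-in for sched_ravg_hist_size.
 */
#if 0
#include <stdint.h>

#define HIST_SIZE	5	/* stand-in for sched_ravg_hist_size */

enum policy { RECENT, MAX, AVG, MAX_RECENT_AVG };

static uint32_t pick_demand(const uint32_t hist[HIST_SIZE], uint32_t runtime,
			    enum policy policy)
{
	uint64_t sum = 0;
	uint32_t max = 0, avg;
	int i;

	for (i = 0; i < HIST_SIZE; i++) {
		sum += hist[i];
		if (hist[i] > max)
			max = hist[i];
	}
	avg = (uint32_t)(sum / HIST_SIZE);

	switch (policy) {
	case RECENT:
		return runtime;		/* most recent window only */
	case MAX:
		return max;		/* largest window in the history */
	case AVG:
		return avg;		/* plain average of the history */
	default:
		return avg > runtime ? avg : runtime;	/* MAX_RECENT_AVG */
	}
}
#endif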
2535
2536static void add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
2537{
2538 delta = scale_exec_time(delta, rq);
2539 p->ravg.sum += delta;
2540 if (unlikely(p->ravg.sum > sched_ravg_window))
2541 p->ravg.sum = sched_ravg_window;
2542}
2543
2544/*
2545 * Account cpu demand of task and/or update task's cpu demand history
2546 *
2547 * ms = p->ravg.mark_start;
2548 * wc = wallclock
2549 * ws = rq->window_start
2550 *
2551 * Three possibilities:
2552 *
2553 * a) Task event is contained within one window.
2554 * window_start < mark_start < wallclock
2555 *
2556 * ws ms wc
2557 * | | |
2558 * V V V
2559 * |---------------|
2560 *
2561 * In this case, p->ravg.sum is updated *iff* event is appropriate
2562 * (ex: event == PUT_PREV_TASK)
2563 *
2564 * b) Task event spans two windows.
2565 * mark_start < window_start < wallclock
2566 *
2567 * ms ws wc
2568 * | | |
2569 * V V V
2570 * -----|-------------------
2571 *
2572 * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
2573 * is appropriate, then a new window sample is recorded followed
2574 * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
2575 *
2576 * c) Task event spans more than two windows.
2577 *
2578 * ms ws_tmp ws wc
2579 * | | | |
2580 * V V V V
2581 * ---|-------|-------|-------|-------|------
2582 * | |
2583 * |<------ nr_full_windows ------>|
2584 *
2585 * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
2586 * event is appropriate, window sample of p->ravg.sum is recorded,
2587 * 'nr_full_windows' samples of window_size are also recorded *iff*
2588 * event is appropriate and finally p->ravg.sum is set to (wc - ws)
2589 * *iff* event is appropriate.
2590 *
2591 * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
2592 * depends on it!
2593 */
2594static void update_task_demand(struct task_struct *p, struct rq *rq,
2595 int event, u64 wallclock)
2596{
2597 u64 mark_start = p->ravg.mark_start;
2598 u64 delta, window_start = rq->window_start;
2599 int new_window, nr_full_windows;
2600 u32 window_size = sched_ravg_window;
2601
2602 new_window = mark_start < window_start;
2603 if (!account_busy_for_task_demand(p, event)) {
2604 if (new_window)
2605 /*
2606			 * If the time isn't being accounted as busy time, and
2607			 * a new window has started, only the previous window
2608			 * needs to be closed out with the pre-existing demand.
2609			 * Multiple windows may have
2610 * elapsed, but since empty windows are dropped,
2611 * it is not necessary to account those.
2612 */
2613 update_history(rq, p, p->ravg.sum, 1, event);
2614 return;
2615 }
2616
2617 if (!new_window) {
2618 /*
2619 * The simple case - busy time contained within the existing
2620 * window.
2621 */
2622 add_to_task_demand(rq, p, wallclock - mark_start);
2623 return;
2624 }
2625
2626 /*
2627 * Busy time spans at least two windows. Temporarily rewind
2628 * window_start to first window boundary after mark_start.
2629 */
2630 delta = window_start - mark_start;
2631 nr_full_windows = div64_u64(delta, window_size);
2632 window_start -= (u64)nr_full_windows * (u64)window_size;
2633
2634 /* Process (window_start - mark_start) first */
2635 add_to_task_demand(rq, p, window_start - mark_start);
2636
2637 /* Push new sample(s) into task's demand history */
2638 update_history(rq, p, p->ravg.sum, 1, event);
2639 if (nr_full_windows)
2640 update_history(rq, p, scale_exec_time(window_size, rq),
2641 nr_full_windows, event);
2642
2643 /*
2644 * Roll window_start back to current to process any remainder
2645 * in current window.
2646 */
2647 window_start += (u64)nr_full_windows * (u64)window_size;
2648
2649 /* Process (wallclock - window_start) next */
2650 mark_start = window_start;
2651 add_to_task_demand(rq, p, wallclock - mark_start);
2652}
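/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * case (c) decomposition described in the comment above update_task_demand().
 * Given mark_start (ms), the current window_start (ws) and wallclock (wc),
 * the busy span splits into a head piece completing the first window,
 * nr_full_windows whole windows, and a tail in the current window.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static void split_span(uint64_t ms, uint64_t ws, uint64_t wc,
		       uint64_t window_size)
{
	uint64_t nr_full = (ws - ms) / window_size;
	uint64_t first_ws = ws - nr_full * window_size;

	printf("head %llu ns, %llu full windows, tail %llu ns\n",
	       (unsigned long long)(first_ws - ms),
	       (unsigned long long)nr_full,
	       (unsigned long long)(wc - ws));
}

int main(void)
{
	/* 10 ms windows: ms=3ms, ws=40ms, wc=42ms -> head 7ms, 3 full, tail 2ms */
	split_span(3000000ULL, 40000000ULL, 42000000ULL, 10000000ULL);
	return 0;
}
#endif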
2653
2654/* Reflect task activity on its demand and cpu's busy time statistics */
2655void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
2656 u64 wallclock, u64 irqtime)
2657{
2658 if (!rq->window_start || sched_disable_window_stats)
2659 return;
2660
2661 lockdep_assert_held(&rq->lock);
2662
2663 update_window_start(rq, wallclock);
2664
2665 if (!p->ravg.mark_start) {
2666 update_task_cpu_cycles(p, cpu_of(rq));
2667 goto done;
2668 }
2669
2670 update_task_rq_cpu_cycles(p, rq, event, wallclock, irqtime);
2671 update_task_demand(p, rq, event, wallclock);
2672 update_cpu_busy_time(p, rq, event, wallclock, irqtime);
2673 update_task_pred_demand(rq, p, event);
2674done:
2675 trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime,
2676 rq->cc.cycles, rq->cc.time,
2677 _group_cpu_time(p->grp, cpu_of(rq)));
2678
2679 p->ravg.mark_start = wallclock;
2680}
2681
2682void sched_account_irqtime(int cpu, struct task_struct *curr,
2683 u64 delta, u64 wallclock)
2684{
2685 struct rq *rq = cpu_rq(cpu);
2686 unsigned long flags, nr_windows;
2687 u64 cur_jiffies_ts;
2688
2689 raw_spin_lock_irqsave(&rq->lock, flags);
2690
2691 /*
2692 * cputime (wallclock) uses sched_clock so use the same here for
2693 * consistency.
2694 */
2695 delta += sched_clock() - wallclock;
2696 cur_jiffies_ts = get_jiffies_64();
2697
2698 if (is_idle_task(curr))
2699 update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
2700 delta);
2701
2702 nr_windows = cur_jiffies_ts - rq->irqload_ts;
2703
2704 if (nr_windows) {
2705 if (nr_windows < 10) {
2706 /* Decay CPU's irqload by 3/4 for each window. */
2707 rq->avg_irqload *= (3 * nr_windows);
2708 rq->avg_irqload = div64_u64(rq->avg_irqload,
2709 4 * nr_windows);
2710 } else {
2711 rq->avg_irqload = 0;
2712 }
2713 rq->avg_irqload += rq->cur_irqload;
2714 rq->cur_irqload = 0;
2715 }
2716
2717 rq->cur_irqload += delta;
2718 rq->irqload_ts = cur_jiffies_ts;
2719 raw_spin_unlock_irqrestore(&rq->lock, flags);
2720}
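/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * average-irqload update in sched_account_irqtime(). For one to nine
 * elapsed windows the scale factor 3*n/(4*n) reduces to 3/4; for ten or
 * more windows the history is dropped entirely before the current
 * window's irq time is folded in.
 */
#if 0
#include <stdint.h>

static uint64_t decay_irqload(uint64_t avg_irqload, uint64_t cur_irqload,
			      uint64_t nr_windows)
{
	if (nr_windows) {
		if (nr_windows < 10)
			avg_irqload = avg_irqload * 3 / 4;
		else
			avg_irqload = 0;
		avg_irqload += cur_irqload;
	}
	return avg_irqload;
}
#endif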
2721
2722void sched_account_irqstart(int cpu, struct task_struct *curr, u64 wallclock)
2723{
2724 struct rq *rq = cpu_rq(cpu);
2725
2726 if (!rq->window_start || sched_disable_window_stats)
2727 return;
2728
2729 if (is_idle_task(curr)) {
2730 /* We're here without rq->lock held, IRQ disabled */
2731 raw_spin_lock(&rq->lock);
2732 update_task_cpu_cycles(curr, cpu);
2733 raw_spin_unlock(&rq->lock);
2734 }
2735}
2736
2737void reset_task_stats(struct task_struct *p)
2738{
2739 u32 sum = 0;
2740
2741 if (exiting_task(p))
2742 sum = EXITING_TASK_MARKER;
2743
2744 memset(&p->ravg, 0, sizeof(struct ravg));
2745 /* Retain EXITING_TASK marker */
2746 p->ravg.sum_history[0] = sum;
2747}
2748
2749void mark_task_starting(struct task_struct *p)
2750{
2751 u64 wallclock;
2752 struct rq *rq = task_rq(p);
2753
2754 if (!rq->window_start || sched_disable_window_stats) {
2755 reset_task_stats(p);
2756 return;
2757 }
2758
2759 wallclock = sched_ktime_clock();
2760 p->ravg.mark_start = p->last_wake_ts = wallclock;
2761 p->last_cpu_selected_ts = wallclock;
2762 p->last_switch_out_ts = 0;
2763 update_task_cpu_cycles(p, cpu_of(rq));
2764}
2765
2766void set_window_start(struct rq *rq)
2767{
2768 int cpu = cpu_of(rq);
2769 struct rq *sync_rq = cpu_rq(sync_cpu);
2770
2771 if (rq->window_start)
2772 return;
2773
2774 if (cpu == sync_cpu) {
2775 rq->window_start = sched_ktime_clock();
2776 } else {
2777 raw_spin_unlock(&rq->lock);
2778 double_rq_lock(rq, sync_rq);
2779 rq->window_start = cpu_rq(sync_cpu)->window_start;
2780 rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
2781 rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
2782 raw_spin_unlock(&sync_rq->lock);
2783 }
2784
2785 rq->curr->ravg.mark_start = rq->window_start;
2786}
2787
2788void migrate_sync_cpu(int cpu)
2789{
2790 if (cpu == sync_cpu)
2791 sync_cpu = smp_processor_id();
2792}
2793
2794static void reset_all_task_stats(void)
2795{
2796 struct task_struct *g, *p;
2797
2798 read_lock(&tasklist_lock);
2799 do_each_thread(g, p) {
2800 reset_task_stats(p);
2801 } while_each_thread(g, p);
2802 read_unlock(&tasklist_lock);
2803}
2804
2805static void disable_window_stats(void)
2806{
2807 unsigned long flags;
2808 int i;
2809
2810 local_irq_save(flags);
2811 for_each_possible_cpu(i)
2812 raw_spin_lock(&cpu_rq(i)->lock);
2813
2814 sched_disable_window_stats = 1;
2815
2816 for_each_possible_cpu(i)
2817 raw_spin_unlock(&cpu_rq(i)->lock);
2818
2819 local_irq_restore(flags);
2820}
2821
2822/* Called with all cpu's rq->lock held */
2823static void enable_window_stats(void)
2824{
2825 sched_disable_window_stats = 0;
2827}
2828
2829enum reset_reason_code {
2830 WINDOW_CHANGE,
2831 POLICY_CHANGE,
2832 HIST_SIZE_CHANGE,
2833 FREQ_AGGREGATE_CHANGE,
2834};
2835
2836const char *sched_window_reset_reasons[] = {
2837 "WINDOW_CHANGE",
2838 "POLICY_CHANGE",
2839	"HIST_SIZE_CHANGE",
	"FREQ_AGGREGATE_CHANGE",
2840};
2841
2842/* Called with IRQs enabled */
2843void reset_all_window_stats(u64 window_start, unsigned int window_size)
2844{
2845 int cpu;
2846 unsigned long flags;
2847 u64 start_ts = sched_ktime_clock();
2848 int reason = WINDOW_CHANGE;
2849 unsigned int old = 0, new = 0;
2850 struct related_thread_group *grp;
2851
2852 disable_window_stats();
2853
2854 reset_all_task_stats();
2855
2856 local_irq_save(flags);
2857
2858 read_lock(&related_thread_group_lock);
2859
2860 for_each_possible_cpu(cpu)
2861 raw_spin_lock(&cpu_rq(cpu)->lock);
2862
2863 list_for_each_entry(grp, &related_thread_groups, list) {
2864 int j;
2865
2866 for_each_possible_cpu(j) {
2867 struct group_cpu_time *cpu_time;
2868 /* Protected by rq lock */
2869 cpu_time = _group_cpu_time(grp, j);
2870 memset(cpu_time, 0, sizeof(struct group_cpu_time));
2871 if (window_start)
2872 cpu_time->window_start = window_start;
2873 }
2874 }
2875
2876 if (window_size) {
2877 sched_ravg_window = window_size * TICK_NSEC;
2878 set_hmp_defaults();
2879 }
2880
2881 enable_window_stats();
2882
2883 for_each_possible_cpu(cpu) {
2884 struct rq *rq = cpu_rq(cpu);
2885
2886 if (window_start)
2887 rq->window_start = window_start;
2888 rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
2889 rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
2890 reset_cpu_hmp_stats(cpu, 1);
2891 }
2892
2893 if (sched_window_stats_policy != sysctl_sched_window_stats_policy) {
2894 reason = POLICY_CHANGE;
2895 old = sched_window_stats_policy;
2896 new = sysctl_sched_window_stats_policy;
2897 sched_window_stats_policy = sysctl_sched_window_stats_policy;
2898 } else if (sched_ravg_hist_size != sysctl_sched_ravg_hist_size) {
2899 reason = HIST_SIZE_CHANGE;
2900 old = sched_ravg_hist_size;
2901 new = sysctl_sched_ravg_hist_size;
2902 sched_ravg_hist_size = sysctl_sched_ravg_hist_size;
2903 } else if (sched_freq_aggregate !=
2904 sysctl_sched_freq_aggregate) {
2905 reason = FREQ_AGGREGATE_CHANGE;
2906 old = sched_freq_aggregate;
2907 new = sysctl_sched_freq_aggregate;
2908 sched_freq_aggregate = sysctl_sched_freq_aggregate;
2909 }
2910
2911 for_each_possible_cpu(cpu)
2912 raw_spin_unlock(&cpu_rq(cpu)->lock);
2913
2914 read_unlock(&related_thread_group_lock);
2915
2916 local_irq_restore(flags);
2917
2918 trace_sched_reset_all_window_stats(window_start, window_size,
2919 sched_ktime_clock() - start_ts, reason, old, new);
2920}
2921
2922static inline void
2923sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time);
2924
2925void sched_get_cpus_busy(struct sched_load *busy,
2926 const struct cpumask *query_cpus)
2927{
2928 unsigned long flags;
2929 struct rq *rq;
2930 const int cpus = cpumask_weight(query_cpus);
2931 u64 load[cpus], group_load[cpus];
2932 u64 nload[cpus], ngload[cpus];
2933 u64 pload[cpus];
2934 unsigned int cur_freq[cpus], max_freq[cpus];
2935 int notifier_sent = 0;
2936 int early_detection[cpus];
2937 int cpu, i = 0;
2938 unsigned int window_size;
2939 u64 max_prev_sum = 0;
2940 int max_busy_cpu = cpumask_first(query_cpus);
2941 struct related_thread_group *grp;
2942 u64 total_group_load = 0, total_ngload = 0;
2943 bool aggregate_load = false;
2944
2945 if (unlikely(cpus == 0))
2946 return;
2947
2948 /*
2949 * This function could be called in timer context, and the
2950 * current task may have been executing for a long time. Ensure
2951 * that the window stats are current by doing an update.
2952 */
2953 read_lock(&related_thread_group_lock);
2954
2955 local_irq_save(flags);
2956 for_each_cpu(cpu, query_cpus)
2957 raw_spin_lock(&cpu_rq(cpu)->lock);
2958
2959 window_size = sched_ravg_window;
2960
2961 for_each_cpu(cpu, query_cpus) {
2962 rq = cpu_rq(cpu);
2963
2964 update_task_ravg(rq->curr, rq, TASK_UPDATE, sched_ktime_clock(),
2965 0);
2966 cur_freq[i] = cpu_cycles_to_freq(rq->cc.cycles, rq->cc.time);
2967
2968 load[i] = rq->old_busy_time = rq->prev_runnable_sum;
2969 nload[i] = rq->nt_prev_runnable_sum;
2970 pload[i] = rq->hmp_stats.pred_demands_sum;
2971 rq->old_estimated_time = pload[i];
2972
2973 if (load[i] > max_prev_sum) {
2974 max_prev_sum = load[i];
2975 max_busy_cpu = cpu;
2976 }
2977
2978 /*
2979 * sched_get_cpus_busy() is called for all CPUs in a
2980 * frequency domain. So the notifier_sent flag per
2981 * cluster works even when a frequency domain spans
2982 * more than 1 cluster.
2983 */
2984 if (rq->cluster->notifier_sent) {
2985 notifier_sent = 1;
2986 rq->cluster->notifier_sent = 0;
2987 }
2988 early_detection[i] = (rq->ed_task != NULL);
2989 cur_freq[i] = cpu_cur_freq(cpu);
2990 max_freq[i] = cpu_max_freq(cpu);
2991 i++;
2992 }
2993
2994 for_each_related_thread_group(grp) {
2995 for_each_cpu(cpu, query_cpus) {
2996 /* Protected by rq_lock */
2997 struct group_cpu_time *cpu_time =
2998 _group_cpu_time(grp, cpu);
2999 sync_window_start(cpu_rq(cpu), cpu_time);
3000 }
3001 }
3002
3003 group_load_in_freq_domain(
3004 &cpu_rq(max_busy_cpu)->freq_domain_cpumask,
3005 &total_group_load, &total_ngload);
3006 aggregate_load = !!(total_group_load > sched_freq_aggregate_threshold);
3007
3008 i = 0;
3009 for_each_cpu(cpu, query_cpus) {
3010 group_load[i] = 0;
3011 ngload[i] = 0;
3012
3013 if (early_detection[i])
3014 goto skip_early;
3015
3016 rq = cpu_rq(cpu);
3017 if (aggregate_load) {
3018 if (cpu == max_busy_cpu) {
3019 group_load[i] = total_group_load;
3020 ngload[i] = total_ngload;
3021 }
3022 } else {
3023 _group_load_in_cpu(cpu, &group_load[i], &ngload[i]);
3024 }
3025
3026 load[i] += group_load[i];
3027 nload[i] += ngload[i];
3028 /*
3029 * Scale load in reference to cluster max_possible_freq.
3030 *
3031 * Note that scale_load_to_cpu() scales load in reference to
3032 * the cluster max_freq.
3033 */
3034 load[i] = scale_load_to_cpu(load[i], cpu);
3035 nload[i] = scale_load_to_cpu(nload[i], cpu);
3036 pload[i] = scale_load_to_cpu(pload[i], cpu);
3037skip_early:
3038 i++;
3039 }
3040
3041 for_each_cpu(cpu, query_cpus)
3042 raw_spin_unlock(&(cpu_rq(cpu))->lock);
3043 local_irq_restore(flags);
3044
3045 read_unlock(&related_thread_group_lock);
3046
3047 i = 0;
3048 for_each_cpu(cpu, query_cpus) {
3049 rq = cpu_rq(cpu);
3050
3051 if (early_detection[i]) {
3052 busy[i].prev_load = div64_u64(sched_ravg_window,
3053 NSEC_PER_USEC);
3054 busy[i].new_task_load = 0;
3055 goto exit_early;
3056 }
3057
3058 /*
3059 * When the load aggregation is controlled by
3060 * sched_freq_aggregate_threshold, allow reporting loads
3061 * greater than 100 @ Fcur to ramp up the frequency
3062 * faster.
3063 */
3064 if (notifier_sent || (aggregate_load &&
3065 sched_freq_aggregate_threshold)) {
3066 load[i] = scale_load_to_freq(load[i], max_freq[i],
3067 cpu_max_possible_freq(cpu));
3068 nload[i] = scale_load_to_freq(nload[i], max_freq[i],
3069 cpu_max_possible_freq(cpu));
3070 } else {
3071 load[i] = scale_load_to_freq(load[i], max_freq[i],
3072 cur_freq[i]);
3073 nload[i] = scale_load_to_freq(nload[i], max_freq[i],
3074 cur_freq[i]);
3075 if (load[i] > window_size)
3076 load[i] = window_size;
3077 if (nload[i] > window_size)
3078 nload[i] = window_size;
3079
3080 load[i] = scale_load_to_freq(load[i], cur_freq[i],
3081 cpu_max_possible_freq(cpu));
3082 nload[i] = scale_load_to_freq(nload[i], cur_freq[i],
3083 cpu_max_possible_freq(cpu));
3084 }
3085 pload[i] = scale_load_to_freq(pload[i], max_freq[i],
3086 rq->cluster->max_possible_freq);
3087
3088 busy[i].prev_load = div64_u64(load[i], NSEC_PER_USEC);
3089 busy[i].new_task_load = div64_u64(nload[i], NSEC_PER_USEC);
3090 busy[i].predicted_load = div64_u64(pload[i], NSEC_PER_USEC);
3091
3092exit_early:
3093 trace_sched_get_busy(cpu, busy[i].prev_load,
3094 busy[i].new_task_load,
3095 busy[i].predicted_load,
3096 early_detection[i]);
3097 i++;
3098 }
3099}
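/*
 * Illustrative sketch (standalone, not compiled in this file): the load
 * reporting path in sched_get_cpus_busy() when no notifier is pending and
 * aggregation is not in effect. scale() models scale_load_to_freq() under
 * the assumption that it rescales a busy time from one frequency reference
 * to another (load * src / dst); the actual helper is defined elsewhere in
 * this scheduler code.
 */
#if 0
#include <stdint.h>

static uint64_t scale(uint64_t load, uint64_t src_freq, uint64_t dst_freq)
{
	return load * src_freq / dst_freq;	/* assumed helper behaviour */
}

static uint64_t report_load(uint64_t load_ns, uint64_t max_freq,
			    uint64_t cur_freq, uint64_t max_possible_freq,
			    uint64_t window_ns)
{
	/* busy time at Fmax -> wall time at Fcur, capped at one window ... */
	load_ns = scale(load_ns, max_freq, cur_freq);
	if (load_ns > window_ns)
		load_ns = window_ns;

	/* ... then re-normalised to the max-possible-frequency reference */
	return scale(load_ns, cur_freq, max_possible_freq);
}
#endif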
3100
3101void sched_set_io_is_busy(int val)
3102{
3103 sched_io_is_busy = val;
3104}
3105
3106int sched_set_window(u64 window_start, unsigned int window_size)
3107{
3108 u64 now, cur_jiffies, jiffy_ktime_ns;
3109 s64 ws;
3110 unsigned long flags;
3111
3112 if (window_size * TICK_NSEC < MIN_SCHED_RAVG_WINDOW)
3113 return -EINVAL;
3114
3115 mutex_lock(&policy_mutex);
3116
3117 /*
3118 * Get a consistent view of ktime, jiffies, and the time
3119 * since the last jiffy (based on last_jiffies_update).
3120 */
3121 local_irq_save(flags);
3122 cur_jiffies = jiffy_to_ktime_ns(&now, &jiffy_ktime_ns);
3123 local_irq_restore(flags);
3124
3125 /* translate window_start from jiffies to nanoseconds */
3126 ws = (window_start - cur_jiffies); /* jiffy difference */
3127 ws *= TICK_NSEC;
3128 ws += jiffy_ktime_ns;
3129
3130 /*
3131 * Roll back calculated window start so that it is in
3132 * the past (window stats must have a current window).
3133 */
3134 while (ws > now)
3135 ws -= (window_size * TICK_NSEC);
3136
3137 BUG_ON(sched_ktime_clock() < ws);
3138
3139 reset_all_window_stats(ws, window_size);
3140
3141 sched_update_freq_max_load(cpu_possible_mask);
3142
3143 mutex_unlock(&policy_mutex);
3144
3145 return 0;
3146}
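/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * jiffies-to-nanoseconds translation and roll-back performed in
 * sched_set_window(). TICK_NS is a stand-in for TICK_NSEC (10 ms here,
 * i.e. HZ=100, purely as an example value).
 */
#if 0
#include <stdint.h>

#define TICK_NS		10000000LL	/* stand-in for TICK_NSEC at HZ=100 */

static int64_t window_start_ns(uint64_t window_start_jiffies,
			       uint64_t cur_jiffies, uint64_t jiffy_ktime_ns,
			       uint64_t now_ns, uint64_t window_size_jiffies)
{
	/* jiffy difference converted to ns, anchored at the last tick edge */
	int64_t ws = (int64_t)(window_start_jiffies - cur_jiffies) * TICK_NS
			+ (int64_t)jiffy_ktime_ns;

	/* roll back until the window start lies in the past */
	while (ws > (int64_t)now_ns)
		ws -= (int64_t)window_size_jiffies * TICK_NS;

	return ws;
}
#endif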
3147
3148void fixup_busy_time(struct task_struct *p, int new_cpu)
3149{
3150 struct rq *src_rq = task_rq(p);
3151 struct rq *dest_rq = cpu_rq(new_cpu);
3152 u64 wallclock;
3153 u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
3154 u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
3155 u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
3156 u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
3157 int migrate_type;
3158 struct migration_sum_data d;
3159 bool new_task;
3160 struct related_thread_group *grp;
3161
3162 if (!p->on_rq && p->state != TASK_WAKING)
3163 return;
3164
3165 if (exiting_task(p)) {
3166 clear_ed_task(p, src_rq);
3167 return;
3168 }
3169
3170 if (p->state == TASK_WAKING)
3171 double_rq_lock(src_rq, dest_rq);
3172
3173 if (sched_disable_window_stats)
3174 goto done;
3175
3176 wallclock = sched_ktime_clock();
3177
3178 update_task_ravg(task_rq(p)->curr, task_rq(p),
3179 TASK_UPDATE,
3180 wallclock, 0);
3181 update_task_ravg(dest_rq->curr, dest_rq,
3182 TASK_UPDATE, wallclock, 0);
3183
3184 update_task_ravg(p, task_rq(p), TASK_MIGRATE,
3185 wallclock, 0);
3186
3187 update_task_cpu_cycles(p, new_cpu);
3188
3189 new_task = is_new_task(p);
3190 /* Protected by rq_lock */
3191 grp = p->grp;
3192 if (grp && sched_freq_aggregate) {
3193 struct group_cpu_time *cpu_time;
3194
3195 migrate_type = GROUP_TO_GROUP;
3196 /* Protected by rq_lock */
3197 cpu_time = _group_cpu_time(grp, cpu_of(src_rq));
3198 d.src_rq = NULL;
3199 d.src_cpu_time = cpu_time;
3200 src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
3201 src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
3202 src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
3203 src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
3204
3205 /* Protected by rq_lock */
3206 cpu_time = _group_cpu_time(grp, cpu_of(dest_rq));
3207 d.dst_rq = NULL;
3208 d.dst_cpu_time = cpu_time;
3209 dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
3210 dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
3211 dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
3212 dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
3213 sync_window_start(dest_rq, cpu_time);
3214 } else {
3215 migrate_type = RQ_TO_RQ;
3216 d.src_rq = src_rq;
3217 d.src_cpu_time = NULL;
3218 d.dst_rq = dest_rq;
3219 d.dst_cpu_time = NULL;
3220 src_curr_runnable_sum = &src_rq->curr_runnable_sum;
3221 src_prev_runnable_sum = &src_rq->prev_runnable_sum;
3222 src_nt_curr_runnable_sum = &src_rq->nt_curr_runnable_sum;
3223 src_nt_prev_runnable_sum = &src_rq->nt_prev_runnable_sum;
3224
3225 dst_curr_runnable_sum = &dest_rq->curr_runnable_sum;
3226 dst_prev_runnable_sum = &dest_rq->prev_runnable_sum;
3227 dst_nt_curr_runnable_sum = &dest_rq->nt_curr_runnable_sum;
3228 dst_nt_prev_runnable_sum = &dest_rq->nt_prev_runnable_sum;
3229 }
3230
3231 if (p->ravg.curr_window) {
3232 *src_curr_runnable_sum -= p->ravg.curr_window;
3233 *dst_curr_runnable_sum += p->ravg.curr_window;
3234 if (new_task) {
3235 *src_nt_curr_runnable_sum -= p->ravg.curr_window;
3236 *dst_nt_curr_runnable_sum += p->ravg.curr_window;
3237 }
3238 }
3239
3240 if (p->ravg.prev_window) {
3241 *src_prev_runnable_sum -= p->ravg.prev_window;
3242 *dst_prev_runnable_sum += p->ravg.prev_window;
3243 if (new_task) {
3244 *src_nt_prev_runnable_sum -= p->ravg.prev_window;
3245 *dst_nt_prev_runnable_sum += p->ravg.prev_window;
3246 }
3247 }
3248
3249 if (p == src_rq->ed_task) {
3250 src_rq->ed_task = NULL;
3251 if (!dest_rq->ed_task)
3252 dest_rq->ed_task = p;
3253 }
3254
3255 trace_sched_migration_update_sum(p, migrate_type, &d);
3256 BUG_ON((s64)*src_prev_runnable_sum < 0);
3257 BUG_ON((s64)*src_curr_runnable_sum < 0);
3258 BUG_ON((s64)*src_nt_prev_runnable_sum < 0);
3259 BUG_ON((s64)*src_nt_curr_runnable_sum < 0);
3260
3261done:
3262 if (p->state == TASK_WAKING)
3263 double_rq_unlock(src_rq, dest_rq);
3264}
3265
3266#define sched_up_down_migrate_auto_update 1
3267static void check_for_up_down_migrate_update(const struct cpumask *cpus)
3268{
3269 int i = cpumask_first(cpus);
3270
3271 if (!sched_up_down_migrate_auto_update)
3272 return;
3273
3274 if (cpu_max_possible_capacity(i) == max_possible_capacity)
3275 return;
3276
3277 if (cpu_max_possible_freq(i) == cpu_max_freq(i))
3278 up_down_migrate_scale_factor = 1024;
3279 else
3280 up_down_migrate_scale_factor = (1024 *
3281 cpu_max_possible_freq(i)) / cpu_max_freq(i);
3282
3283 update_up_down_migrate();
3284}
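/*
 * Illustrative sketch (standalone, not compiled in this file): the
 * up/down-migrate scale factor computed above. With a 1.8 GHz hardware
 * ceiling reduced to a 1.5 GHz max frequency (example values only), the
 * migration thresholds get scaled by 1024 * 1800000 / 1500000 = 1228.
 */
#if 0
static unsigned int migrate_scale_factor(unsigned int max_possible_freq,
					 unsigned int max_freq)
{
	if (max_possible_freq == max_freq)
		return 1024;

	return (1024 * max_possible_freq) / max_freq;
}
#endif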
3285
3286/* Return cluster which can offer required capacity for group */
3287static struct sched_cluster *
3288best_cluster(struct related_thread_group *grp, u64 total_demand)
3289{
3290 struct sched_cluster *cluster = NULL;
3291
3292 for_each_sched_cluster(cluster) {
3293 if (group_will_fit(cluster, grp, total_demand))
3294 return cluster;
3295 }
3296
3297 return NULL;
3298}
3299
3300static void _set_preferred_cluster(struct related_thread_group *grp)
3301{
3302 struct task_struct *p;
3303 u64 combined_demand = 0;
3304
3305 if (!sysctl_sched_enable_colocation) {
3306 grp->last_update = sched_ktime_clock();
3307 grp->preferred_cluster = NULL;
3308 return;
3309 }
3310
3311 /*
3312	 * Wakeup of two or more related tasks could race with each other and
3313	 * could result in multiple calls to _set_preferred_cluster being issued
3314	 * at the same time. Avoid the overhead of rechecking the preferred
3315	 * cluster in such cases.
3316 */
3317 if (sched_ktime_clock() - grp->last_update < sched_ravg_window / 10)
3318 return;
3319
3320 list_for_each_entry(p, &grp->tasks, grp_list)
3321 combined_demand += p->ravg.demand;
3322
3323 grp->preferred_cluster = best_cluster(grp, combined_demand);
3324 grp->last_update = sched_ktime_clock();
3325 trace_sched_set_preferred_cluster(grp, combined_demand);
3326}
3327
3328void set_preferred_cluster(struct related_thread_group *grp)
3329{
3330 raw_spin_lock(&grp->lock);
3331 _set_preferred_cluster(grp);
3332 raw_spin_unlock(&grp->lock);
3333}
3334
3335#define ADD_TASK 0
3336#define REM_TASK 1
3337
3338static inline void free_group_cputime(struct related_thread_group *grp)
3339{
3340 free_percpu(grp->cpu_time);
3341}
3342
3343static int alloc_group_cputime(struct related_thread_group *grp)
3344{
3345 int i;
3346 struct group_cpu_time *cpu_time;
3347 int cpu = raw_smp_processor_id();
3348 struct rq *rq = cpu_rq(cpu);
3349 u64 window_start = rq->window_start;
3350
3351 grp->cpu_time = alloc_percpu(struct group_cpu_time);
3352 if (!grp->cpu_time)
3353 return -ENOMEM;
3354
3355 for_each_possible_cpu(i) {
3356 cpu_time = per_cpu_ptr(grp->cpu_time, i);
3357 memset(cpu_time, 0, sizeof(struct group_cpu_time));
3358 cpu_time->window_start = window_start;
3359 }
3360
3361 return 0;
3362}
3363
3364/*
3365 * A group's window_start may be behind. When moving it forward, flip prev/curr
3366 * counters. When moving forward > 1 window, the prev counter is set to 0.
3367 */
3368static inline void
3369sync_window_start(struct rq *rq, struct group_cpu_time *cpu_time)
3370{
3371 u64 delta;
3372 int nr_windows;
3373 u64 curr_sum = cpu_time->curr_runnable_sum;
3374 u64 nt_curr_sum = cpu_time->nt_curr_runnable_sum;
3375
3376 delta = rq->window_start - cpu_time->window_start;
3377 if (!delta)
3378 return;
3379
3380 nr_windows = div64_u64(delta, sched_ravg_window);
3381 if (nr_windows > 1)
3382 curr_sum = nt_curr_sum = 0;
3383
3384 cpu_time->prev_runnable_sum = curr_sum;
3385 cpu_time->curr_runnable_sum = 0;
3386
3387 cpu_time->nt_prev_runnable_sum = nt_curr_sum;
3388 cpu_time->nt_curr_runnable_sum = 0;
3389
3390 cpu_time->window_start = rq->window_start;
3391}
3392
3393/*
3394 * Task's cpu usage is accounted in:
3395 * rq->curr/prev_runnable_sum, when its ->grp is NULL
3396 * grp->cpu_time[cpu]->curr/prev_runnable_sum, when its ->grp is !NULL
3397 *
3398 * Transfer task's cpu usage between those counters when transitioning between
3399 * groups
3400 */
3401static void transfer_busy_time(struct rq *rq, struct related_thread_group *grp,
3402 struct task_struct *p, int event)
3403{
3404 u64 wallclock;
3405 struct group_cpu_time *cpu_time;
3406 u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
3407 u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
3408 u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
3409 u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
3410 struct migration_sum_data d;
3411 int migrate_type;
3412
3413 if (!sched_freq_aggregate)
3414 return;
3415
3416 wallclock = sched_ktime_clock();
3417
3418 update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
3419 update_task_ravg(p, rq, TASK_UPDATE, wallclock, 0);
3420
3421	/* cpu_time protected by related_thread_group_lock, grp->lock and rq_lock */
3422 cpu_time = _group_cpu_time(grp, cpu_of(rq));
3423 if (event == ADD_TASK) {
3424 sync_window_start(rq, cpu_time);
3425 migrate_type = RQ_TO_GROUP;
3426 d.src_rq = rq;
3427 d.src_cpu_time = NULL;
3428 d.dst_rq = NULL;
3429 d.dst_cpu_time = cpu_time;
3430 src_curr_runnable_sum = &rq->curr_runnable_sum;
3431 dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
3432 src_prev_runnable_sum = &rq->prev_runnable_sum;
3433 dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
3434
3435 src_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
3436 dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
3437 src_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
3438 dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
3439 } else {
3440 migrate_type = GROUP_TO_RQ;
3441 d.src_rq = NULL;
3442 d.src_cpu_time = cpu_time;
3443 d.dst_rq = rq;
3444 d.dst_cpu_time = NULL;
3445
3446 /*
3447		 * In case of REM_TASK, cpu_time->window_start would be
3448		 * up to date because of the update_task_ravg() we called
3449		 * above on the moving task. Hence there is no need for
3450		 * sync_window_start().
3451 */
3452 src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
3453 dst_curr_runnable_sum = &rq->curr_runnable_sum;
3454 src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
3455 dst_prev_runnable_sum = &rq->prev_runnable_sum;
3456
3457 src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
3458 dst_nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
3459 src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
3460 dst_nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
3461 }
3462
3463 *src_curr_runnable_sum -= p->ravg.curr_window;
3464 *dst_curr_runnable_sum += p->ravg.curr_window;
3465
3466 *src_prev_runnable_sum -= p->ravg.prev_window;
3467 *dst_prev_runnable_sum += p->ravg.prev_window;
3468
3469 if (is_new_task(p)) {
3470 *src_nt_curr_runnable_sum -= p->ravg.curr_window;
3471 *dst_nt_curr_runnable_sum += p->ravg.curr_window;
3472 *src_nt_prev_runnable_sum -= p->ravg.prev_window;
3473 *dst_nt_prev_runnable_sum += p->ravg.prev_window;
3474 }
3475
3476 trace_sched_migration_update_sum(p, migrate_type, &d);
3477
3478 BUG_ON((s64)*src_curr_runnable_sum < 0);
3479 BUG_ON((s64)*src_prev_runnable_sum < 0);
3480}
3481
3482static inline struct group_cpu_time *
3483task_group_cpu_time(struct task_struct *p, int cpu)
3484{
3485 return _group_cpu_time(rcu_dereference(p->grp), cpu);
3486}
3487
3488static inline struct group_cpu_time *
3489_group_cpu_time(struct related_thread_group *grp, int cpu)
3490{
3491 return grp ? per_cpu_ptr(grp->cpu_time, cpu) : NULL;
3492}
3493
3494struct related_thread_group *alloc_related_thread_group(int group_id)
3495{
3496 struct related_thread_group *grp;
3497
3498 grp = kzalloc(sizeof(*grp), GFP_KERNEL);
3499 if (!grp)
3500 return ERR_PTR(-ENOMEM);
3501
3502 if (alloc_group_cputime(grp)) {
3503 kfree(grp);
3504 return ERR_PTR(-ENOMEM);
3505 }
3506
3507 grp->id = group_id;
3508 INIT_LIST_HEAD(&grp->tasks);
3509 INIT_LIST_HEAD(&grp->list);
3510 raw_spin_lock_init(&grp->lock);
3511
3512 return grp;
3513}
3514
3515struct related_thread_group *lookup_related_thread_group(unsigned int group_id)
3516{
3517 struct related_thread_group *grp;
3518
3519 list_for_each_entry(grp, &related_thread_groups, list) {
3520 if (grp->id == group_id)
3521 return grp;
3522 }
3523
3524 return NULL;
3525}
3526
3527/* See comments before preferred_cluster() */
3528static void free_related_thread_group(struct rcu_head *rcu)
3529{
3530 struct related_thread_group *grp = container_of(rcu, struct
3531 related_thread_group, rcu);
3532
3533 free_group_cputime(grp);
3534 kfree(grp);
3535}
3536
3537static void remove_task_from_group(struct task_struct *p)
3538{
3539 struct related_thread_group *grp = p->grp;
3540 struct rq *rq;
3541 int empty_group = 1;
3542 struct rq_flags rf;
3543
3544 raw_spin_lock(&grp->lock);
3545
3546 rq = __task_rq_lock(p, &rf);
3547 transfer_busy_time(rq, p->grp, p, REM_TASK);
3548 list_del_init(&p->grp_list);
3549 rcu_assign_pointer(p->grp, NULL);
3550 __task_rq_unlock(rq, &rf);
3551
3552 if (!list_empty(&grp->tasks)) {
3553 empty_group = 0;
3554 _set_preferred_cluster(grp);
3555 }
3556
3557 raw_spin_unlock(&grp->lock);
3558
3559 if (empty_group) {
3560 list_del(&grp->list);
3561 call_rcu(&grp->rcu, free_related_thread_group);
3562 }
3563}
3564
3565static int
3566add_task_to_group(struct task_struct *p, struct related_thread_group *grp)
3567{
3568 struct rq *rq;
3569 struct rq_flags rf;
3570
3571 raw_spin_lock(&grp->lock);
3572
3573 /*
3574 * Change p->grp under rq->lock. Will prevent races with read-side
3575 * reference of p->grp in various hot-paths
3576 */
3577 rq = __task_rq_lock(p, &rf);
3578 transfer_busy_time(rq, grp, p, ADD_TASK);
3579 list_add(&p->grp_list, &grp->tasks);
3580 rcu_assign_pointer(p->grp, grp);
3581 __task_rq_unlock(rq, &rf);
3582
3583 _set_preferred_cluster(grp);
3584
3585 raw_spin_unlock(&grp->lock);
3586
3587 return 0;
3588}
3589
3590void add_new_task_to_grp(struct task_struct *new)
3591{
3592 unsigned long flags;
3593 struct related_thread_group *grp;
3594 struct task_struct *parent;
3595
3596 if (!sysctl_sched_enable_thread_grouping)
3597 return;
3598
3599 if (thread_group_leader(new))
3600 return;
3601
3602 parent = new->group_leader;
3603
3604 /*
3605	 * The parent's pi_lock is required here to protect against a
3606	 * race with the parent task being removed from the
3607	 * group.
3608 */
3609 raw_spin_lock_irqsave(&parent->pi_lock, flags);
3610
3611 /* protected by pi_lock. */
3612 grp = task_related_thread_group(parent);
3613 if (!grp) {
3614 raw_spin_unlock_irqrestore(&parent->pi_lock, flags);
3615 return;
3616 }
3617 raw_spin_lock(&grp->lock);
3618
3619 rcu_assign_pointer(new->grp, grp);
3620 list_add(&new->grp_list, &grp->tasks);
3621
3622 raw_spin_unlock(&grp->lock);
3623 raw_spin_unlock_irqrestore(&parent->pi_lock, flags);
3624}
3625
3626int sched_set_group_id(struct task_struct *p, unsigned int group_id)
3627{
3628 int rc = 0, destroy = 0;
3629 unsigned long flags;
3630 struct related_thread_group *grp = NULL, *new = NULL;
3631
3632redo:
3633 raw_spin_lock_irqsave(&p->pi_lock, flags);
3634
3635 if ((current != p && p->flags & PF_EXITING) ||
3636 (!p->grp && !group_id) ||
3637 (p->grp && p->grp->id == group_id))
3638 goto done;
3639
3640 write_lock(&related_thread_group_lock);
3641
3642 if (!group_id) {
3643 remove_task_from_group(p);
3644 write_unlock(&related_thread_group_lock);
3645 goto done;
3646 }
3647
3648 if (p->grp && p->grp->id != group_id)
3649 remove_task_from_group(p);
3650
3651 grp = lookup_related_thread_group(group_id);
3652 if (!grp && !new) {
3653 /* New group */
3654 write_unlock(&related_thread_group_lock);
3655 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3656 new = alloc_related_thread_group(group_id);
3657 if (IS_ERR(new))
3658 return -ENOMEM;
3659 destroy = 1;
3660 /* Rerun checks (like task exiting), since we dropped pi_lock */
3661 goto redo;
3662 } else if (!grp && new) {
3663 /* New group - use object allocated before */
3664 destroy = 0;
3665 list_add(&new->list, &related_thread_groups);
3666 grp = new;
3667 }
3668
3669 BUG_ON(!grp);
3670 rc = add_task_to_group(p, grp);
3671 write_unlock(&related_thread_group_lock);
3672done:
3673 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3674
3675 if (new && destroy) {
3676 free_group_cputime(new);
3677 kfree(new);
3678 }
3679
3680 return rc;
3681}
3682
3683unsigned int sched_get_group_id(struct task_struct *p)
3684{
3685 unsigned int group_id;
3686 struct related_thread_group *grp;
3687
3688 rcu_read_lock();
3689 grp = task_related_thread_group(p);
3690 group_id = grp ? grp->id : 0;
3691 rcu_read_unlock();
3692
3693 return group_id;
3694}
3695
3696static void update_cpu_cluster_capacity(const cpumask_t *cpus)
3697{
3698 int i;
3699 struct sched_cluster *cluster;
3700 struct cpumask cpumask;
3701
3702 cpumask_copy(&cpumask, cpus);
3703 pre_big_task_count_change(cpu_possible_mask);
3704
3705 for_each_cpu(i, &cpumask) {
3706 cluster = cpu_rq(i)->cluster;
3707 cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
3708
3709 cluster->capacity = compute_capacity(cluster);
3710 cluster->load_scale_factor = compute_load_scale_factor(cluster);
3711
3712		/* the 'cpus' cpumask can span more than one cluster */
3713 check_for_up_down_migrate_update(&cluster->cpus);
3714 }
3715
3716 __update_min_max_capacity();
3717
3718 post_big_task_count_change(cpu_possible_mask);
3719}
3720
3721static DEFINE_SPINLOCK(cpu_freq_min_max_lock);
3722void sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax)
3723{
3724 struct cpumask cpumask;
3725 struct sched_cluster *cluster;
3726 int i, update_capacity = 0;
3727 unsigned long flags;
3728
3729 spin_lock_irqsave(&cpu_freq_min_max_lock, flags);
3730 cpumask_copy(&cpumask, cpus);
3731 for_each_cpu(i, &cpumask) {
3732 cluster = cpu_rq(i)->cluster;
3733 cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
3734
3735 update_capacity += (cluster->max_mitigated_freq != fmax);
3736 cluster->max_mitigated_freq = fmax;
3737 }
3738 spin_unlock_irqrestore(&cpu_freq_min_max_lock, flags);
3739
3740 if (update_capacity)
3741 update_cpu_cluster_capacity(cpus);
3742}
3743
3744static int cpufreq_notifier_policy(struct notifier_block *nb,
3745 unsigned long val, void *data)
3746{
3747 struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
3748 struct sched_cluster *cluster = NULL;
3749 struct cpumask policy_cluster = *policy->related_cpus;
3750 unsigned int orig_max_freq = 0;
3751 int i, j, update_capacity = 0;
3752
3753 if (val != CPUFREQ_NOTIFY && val != CPUFREQ_REMOVE_POLICY &&
3754 val != CPUFREQ_CREATE_POLICY)
3755 return 0;
3756
3757 if (val == CPUFREQ_REMOVE_POLICY || val == CPUFREQ_CREATE_POLICY) {
3758 update_min_max_capacity();
3759 return 0;
3760 }
3761
3762 max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
3763 if (min_max_freq == 1)
3764 min_max_freq = UINT_MAX;
3765 min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
3766 BUG_ON(!min_max_freq);
3767 BUG_ON(!policy->max);
3768
3769 for_each_cpu(i, &policy_cluster) {
3770 cluster = cpu_rq(i)->cluster;
3771 cpumask_andnot(&policy_cluster, &policy_cluster,
3772 &cluster->cpus);
3773
3774 orig_max_freq = cluster->max_freq;
3775 cluster->min_freq = policy->min;
3776 cluster->max_freq = policy->max;
3777 cluster->cur_freq = policy->cur;
3778
3779 if (!cluster->freq_init_done) {
3780 mutex_lock(&cluster_lock);
3781 for_each_cpu(j, &cluster->cpus)
3782 cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
3783 policy->related_cpus);
3784 cluster->max_possible_freq = policy->cpuinfo.max_freq;
3785 cluster->max_possible_capacity =
3786 compute_max_possible_capacity(cluster);
3787 cluster->freq_init_done = true;
3788
3789 sort_clusters();
3790 update_all_clusters_stats();
3791 mutex_unlock(&cluster_lock);
3792 continue;
3793 }
3794
3795 update_capacity += (orig_max_freq != cluster->max_freq);
3796 }
3797
3798 if (update_capacity)
3799 update_cpu_cluster_capacity(policy->related_cpus);
3800
3801 return 0;
3802}
3803
3804static int cpufreq_notifier_trans(struct notifier_block *nb,
3805 unsigned long val, void *data)
3806{
3807 struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
3808 unsigned int cpu = freq->cpu, new_freq = freq->new;
3809 unsigned long flags;
3810 struct sched_cluster *cluster;
3811 struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
3812 int i, j;
3813
3814 if (val != CPUFREQ_POSTCHANGE)
3815 return 0;
3816
3817 BUG_ON(!new_freq);
3818
3819 if (cpu_cur_freq(cpu) == new_freq)
3820 return 0;
3821
3822 for_each_cpu(i, &policy_cpus) {
3823 cluster = cpu_rq(i)->cluster;
3824
3825 for_each_cpu(j, &cluster->cpus) {
3826 struct rq *rq = cpu_rq(j);
3827
3828 raw_spin_lock_irqsave(&rq->lock, flags);
3829 update_task_ravg(rq->curr, rq, TASK_UPDATE,
3830 sched_ktime_clock(), 0);
3831 raw_spin_unlock_irqrestore(&rq->lock, flags);
3832 }
3833
3834 cluster->cur_freq = new_freq;
3835 cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
3836 }
3837
3838 return 0;
3839}
3840
3841static int pwr_stats_ready_notifier(struct notifier_block *nb,
3842 unsigned long cpu, void *data)
3843{
3844 cpumask_t mask = CPU_MASK_NONE;
3845
3846 cpumask_set_cpu(cpu, &mask);
3847 sched_update_freq_max_load(&mask);
3848
3849 mutex_lock(&cluster_lock);
3850 sort_clusters();
3851 mutex_unlock(&cluster_lock);
3852
3853 return 0;
3854}
3855
3856static struct notifier_block notifier_policy_block = {
3857 .notifier_call = cpufreq_notifier_policy
3858};
3859
3860static struct notifier_block notifier_trans_block = {
3861 .notifier_call = cpufreq_notifier_trans
3862};
3863
3864static struct notifier_block notifier_pwr_stats_ready = {
3865 .notifier_call = pwr_stats_ready_notifier
3866};
3867
3868int __weak register_cpu_pwr_stats_ready_notifier(struct notifier_block *nb)
3869{
3870 return -EINVAL;
3871}
3872
3873static int register_sched_callback(void)
3874{
3875 int ret;
3876
3877 ret = cpufreq_register_notifier(&notifier_policy_block,
3878 CPUFREQ_POLICY_NOTIFIER);
3879
3880 if (!ret)
3881 ret = cpufreq_register_notifier(&notifier_trans_block,
3882 CPUFREQ_TRANSITION_NOTIFIER);
3883
3884 register_cpu_pwr_stats_ready_notifier(&notifier_pwr_stats_ready);
3885
3886 return 0;
3887}
3888
3889/*
3890 * cpufreq callbacks can be registered at core_initcall time or later.
3891 * Any registration done prior to that is "forgotten" by cpufreq. See
3892 * initialization of variable init_cpufreq_transition_notifier_list_called
3893 * for further information.
3894 */
3895core_initcall(register_sched_callback);
3896
3897int update_preferred_cluster(struct related_thread_group *grp,
3898 struct task_struct *p, u32 old_load)
3899{
3900 u32 new_load = task_load(p);
3901
3902 if (!grp)
3903 return 0;
3904
3905 /*
3906 * Update if task's load has changed significantly or a complete window
3907 * has passed since we last updated preference
3908 */
3909 if (abs(new_load - old_load) > sched_ravg_window / 4 ||
3910 sched_ktime_clock() - grp->last_update > sched_ravg_window)
3911 return 1;
3912
3913 return 0;
3914}
3915
3916bool early_detection_notify(struct rq *rq, u64 wallclock)
3917{
3918 struct task_struct *p;
3919 int loop_max = 10;
3920
3921 if (!sched_boost() || !rq->cfs.h_nr_running)
3922 return 0;
3923
3924 rq->ed_task = NULL;
3925 list_for_each_entry(p, &rq->cfs_tasks, se.group_node) {
3926 if (!loop_max)
3927 break;
3928
3929 if (wallclock - p->last_wake_ts >= EARLY_DETECTION_DURATION) {
3930 rq->ed_task = p;
3931 return 1;
3932 }
3933
3934 loop_max--;
3935 }
3936
3937 return 0;
3938}
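/*
 * Illustrative sketch (standalone, not compiled in this file): the per-task
 * test used by early_detection_notify() above. A runnable cfs task whose
 * last wakeup was more than the early-detection threshold ago flags the
 * runqueue. EARLY_DETECTION_NS is a stand-in value; the kernel uses
 * EARLY_DETECTION_DURATION.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define EARLY_DETECTION_NS	9500000ULL	/* stand-in threshold, ns */

static bool is_early_detection_task(uint64_t wallclock, uint64_t last_wake_ts)
{
	return wallclock - last_wake_ts >= EARLY_DETECTION_NS;
}
#endif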
3939
3940#ifdef CONFIG_CGROUP_SCHED
3941u64 cpu_upmigrate_discourage_read_u64(struct cgroup_subsys_state *css,
3942 struct cftype *cft)
3943{
3944 struct task_group *tg = css_tg(css);
3945
3946 return tg->upmigrate_discouraged;
3947}
3948
3949int cpu_upmigrate_discourage_write_u64(struct cgroup_subsys_state *css,
3950 struct cftype *cft, u64 upmigrate_discourage)
3951{
3952 struct task_group *tg = css_tg(css);
3953 int discourage = upmigrate_discourage > 0;
3954
3955 if (tg->upmigrate_discouraged == discourage)
3956 return 0;
3957
3958 /*
3959 * Revisit big-task classification for tasks of this cgroup. It would
3960 * have been efficient to walk tasks of just this cgroup in running
3961 * state, but we don't have easy means to do that. Walk all tasks in
3962 * running state on all cpus instead and re-visit their big task
3963 * classification.
3964 */
3965 get_online_cpus();
3966 pre_big_task_count_change(cpu_online_mask);
3967
3968 tg->upmigrate_discouraged = discourage;
3969
3970 post_big_task_count_change(cpu_online_mask);
3971 put_online_cpus();
3972
3973 return 0;
3974}
3975#endif /* CONFIG_CGROUP_SCHED */