// SPDX-License-Identifier: GPL-2.0
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 * Various enhancements by Dmitry Adamushko.
 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 * Scaled math optimizations by Thomas Gleixner
 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
 */
#include "sched.h"

#include <trace/events/sched.h>

/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 *
 * Options are:
 *
 *   SCHED_TUNABLESCALING_NONE - unscaled, always *1
 *   SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
 *   SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

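/*
 * Illustrative arithmetic: with the default LOG scaling on an 8-CPU
 * system, get_update_sysctl_factor() below returns 1 + ilog2(8) = 4, so
 * the effective tunables become sched_latency = 24ms,
 * sched_min_granularity = 3ms and sched_wakeup_granularity = 4ms.
 */
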
/*
 * Minimal preemption granularity for CPU-bound tasks:
 *
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 *
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

#ifdef CONFIG_SMP
/*
 * For asym packing, by default the lower numbered CPU has higher priority.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}
#endif

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

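/*
 * For example, with the default 5ms slice and a group quota of 20ms per
 * 100ms period, a cfs_rq pulls runtime from the global pool in up to four
 * 5ms chunks per period; once only 2ms of quota remain, the next request
 * is issued just those remaining 2ms rather than a full slice.
 */
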
/*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
 *
 * (default: ~20%)
 */
unsigned int capacity_margin = 1280;

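/*
 * With the default margin of 1280, a task is considered to fit a CPU when
 * util * 1280 < capacity * 1024, i.e. when it uses less than roughly 80%
 * of that CPU's capacity (1024/1280 = 0.8), leaving ~20% headroom.
 */
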
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static inline void update_load_set(struct load_weight *lw, unsigned long w)
{
	lw->weight = w;
	lw->inv_weight = 0;
}

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static unsigned int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
	update_sysctl();
}

#define WMULT_CONST	(~0U)
#define WMULT_SHIFT	32

static void __update_inv_weight(struct load_weight *lw)
{
	unsigned long w;

	if (likely(lw->inv_weight))
		return;

	w = scale_load_down(lw->weight);

	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
		lw->inv_weight = 1;
	else if (unlikely(!w))
		lw->inv_weight = WMULT_CONST;
	else
		lw->inv_weight = WMULT_CONST / w;
}

/*
 * delta_exec * weight / lw.weight
 *   OR
 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
 *
 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
 * we're guaranteed shift stays positive because inv_weight is guaranteed to
 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
 *
 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
 * weight/lw.weight <= 1, and therefore our shift will also be positive.
 */
static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
{
	u64 fact = scale_load_down(weight);
	int shift = WMULT_SHIFT;

	__update_inv_weight(lw);

	if (unlikely(fact >> 32)) {
		while (fact >> 32) {
			fact >>= 1;
			shift--;
		}
	}

	/* hint to use a 32x32->64 mul */
	fact = (u64)(u32)fact * lw->inv_weight;

	while (fact >> 32) {
		fact >>= 1;
		shift--;
	}

	return mul_u64_u32_shr(delta_exec, fact, shift);
}

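/*
 * Rough numeric sketch of the fixed-point math above: for weight 1024
 * against lw.weight 2048 (after scale_load_down()), inv_weight is about
 * 2^32 / 2048 = 2^21, fact becomes 1024 * 2^21 = 2^31 (still fitting in
 * 32 bits, so shift stays 32), and mul_u64_u32_shr() returns
 * delta_exec * 2^31 >> 32, i.e. delta_exec * 1024 / 2048, with no 64-bit
 * division anywhere.
 */
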

const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

static inline struct task_struct *task_of(struct sched_entity *se)
{
	SCHED_WARN_ON(!entity_is_task(se));
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		struct rq *rq = rq_of(cfs_rq);
		int cpu = cpu_of(rq);
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued. The fact that we always enqueue bottom-up
		 * reduces this to two cases and a special case for the root
		 * cfs_rq. Furthermore, it also means that we will always reset
		 * tmp_alone_branch either when the branch is connected
		 * to a tree or when we reach the beginning of the tree.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
			/*
			 * If the parent is already on the list, we add the
			 * child just before. Thanks to the circular linked
			 * property of the list, this means putting the child
			 * at the tail of the list that starts with the parent.
			 */
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
			/*
			 * The branch is now connected to its tree so we can
			 * reset tmp_alone_branch to the beginning of the
			 * list.
			 */
			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		} else if (!cfs_rq->tg->parent) {
			/*
			 * A cfs_rq without a parent should be put
			 * at the tail of the list.
			 */
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq->leaf_cfs_rq_list);
			/*
			 * We have reached the beginning of a tree, so we can
			 * reset tmp_alone_branch to the beginning of the list.
			 */
			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
		} else {
			/*
			 * The parent has not been added yet, so we want to
			 * make sure that it will be put after us.
			 * tmp_alone_branch points to the beginning of the
			 * branch where we will add the parent.
			 */
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				rq->tmp_alone_branch);
			/*
			 * Update tmp_alone_branch to point to the new
			 * beginning of the branch.
			 */
			rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
		}

		cfs_rq->on_list = 1;
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)			\
	list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,	\
				 leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline struct cfs_rq *
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return se->cfs_rq;

	return NULL;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * A preemption test can be made between sibling entities that are in
	 * the same cfs_rq, i.e. that have a common parent. Walk up the
	 * hierarchy of both tasks until we find their ancestors that are
	 * siblings of a common parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = (*se)->depth;
	pse_depth = (*pse)->depth;

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else /* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}


#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)	\
		for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif /* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - max_vruntime);
	if (delta > 0)
		max_vruntime = vruntime;

	return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);

	u64 vruntime = cfs_rq->min_vruntime;

	if (curr) {
		if (curr->on_rq)
			vruntime = curr->vruntime;
		else
			curr = NULL;
	}

	if (leftmost) { /* non-empty tree */
		struct sched_entity *se;
		se = rb_entry(leftmost, struct sched_entity, run_node);

		if (!curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	/* ensure we never gain time by being placed backwards. */
	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	bool leftmost = true;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = false;
		}
	}

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color_cached(&se->run_node,
			       &cfs_rq->tasks_timeline, leftmost);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	if (unlikely(nr_running > sched_nr_latency))
		return nr_running * sysctl_sched_min_granularity;
	else
		return sysctl_sched_latency;
}

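/*
 * For example, with the unscaled defaults (latency 6ms, min granularity
 * 0.75ms, sched_nr_latency 8): 5 runnable tasks share a single 6ms period,
 * while 16 runnable tasks stretch the period to 16 * 0.75ms = 12ms so that
 * no slice drops below the minimum granularity.
 */
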
/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = __calc_delta(slice, se->load.weight, load);
	}
	return slice;
}

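/*
 * Illustration for a flat hierarchy, using weights from the
 * sched_prio_to_weight[] table: with one nice-0 task (weight 1024) and one
 * nice-5 task (weight 335) runnable, a 6ms period is split roughly
 * 4.5ms / 1.5ms, i.e. each task's wall-time slice is proportional to its
 * share of the total runqueue weight (1024/1359 vs 335/1359).
 */
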
/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

#ifdef CONFIG_SMP
#include "pelt.h"
#include "sched-pelt.h"

static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
static unsigned long capacity_of(int cpu);

/* Give a new sched_entity its starting runnable values, so its load looks heavy during its infancy */
void init_entity_runnable_average(struct sched_entity *se)
{
	struct sched_avg *sa = &se->avg;

	memset(sa, 0, sizeof(*sa));

	/*
	 * Tasks are initialized with full load to be seen as heavy tasks until
	 * they get a chance to stabilize to their real load level.
	 * Group entities are initialized with zero load to reflect the fact
	 * that nothing has been attached to the task group yet.
	 */
	if (entity_is_task(se))
		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);

	se->runnable_weight = se->load.weight;

	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
static void attach_entity_cfs_rq(struct sched_entity *se);

/*
 * With new tasks being created, their initial util_avgs are extrapolated
 * based on the cfs_rq's current util_avg:
 *
 *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
 *
 * However, in many cases, the above util_avg does not give a desired
 * value. Moreover, the sum of the util_avgs may be divergent, such
 * as when the series is a harmonic series.
 *
 * To solve this problem, we also cap the util_avg of successive tasks to
 * only 1/2 of the left utilization budget:
 *
 *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
 *
 * where n denotes the nth task and cpu_scale the CPU capacity.
 *
 * For example, for a CPU with 1024 of capacity, a simplest series from
 * the beginning would be like:
 *
 *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
 *
 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
 * if util_avg > util_avg_cap.
 */
void post_init_entity_util_avg(struct sched_entity *se)
{
	struct cfs_rq *cfs_rq = cfs_rq_of(se);
	struct sched_avg *sa = &se->avg;
	long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
	long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;

	if (cap > 0) {
		if (cfs_rq->avg.util_avg != 0) {
			sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
			sa->util_avg /= (cfs_rq->avg.load_avg + 1);

			if (sa->util_avg > cap)
				sa->util_avg = cap;
		} else {
			sa->util_avg = cap;
		}
	}

	if (entity_is_task(se)) {
		struct task_struct *p = task_of(se);
		if (p->sched_class != &fair_sched_class) {
			/*
			 * For !fair tasks do:
			 *
			update_cfs_rq_load_avg(now, cfs_rq);
			attach_entity_load_avg(cfs_rq, se, 0);
			switched_from_fair(rq, p);
			 *
			 * such that the next switched_to_fair() has the
			 * expected state.
			 */
			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
			return;
		}
	}

	attach_entity_cfs_rq(se);
}

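/*
 * Numeric sketch of the capping above: on a CPU of capacity 1024 whose
 * cfs_rq already has util_avg = 512 and load_avg = 1023, a forked nice-0
 * task (weight 1024) extrapolates to util_avg = 512, but the remaining
 * budget is 1024 - 512 = 512, so the cap of 512 / 2 = 256 applies and the
 * task starts out with util_avg = 256.
 */
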
#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct sched_entity *se)
{
}
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}
#endif /* CONFIG_SMP */

/*
 * Update the current task's runtime statistics.
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_clock_task(rq_of(cfs_rq));
	u64 delta_exec;

	if (unlikely(!curr))
		return;

	delta_exec = now - curr->exec_start;
	if (unlikely((s64)delta_exec <= 0))
		return;

	curr->exec_start = now;

	schedstat_set(curr->statistics.exec_max,
		      max(delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq->exec_clock, delta_exec);

	curr->vruntime += calc_delta_fair(delta_exec, curr);
	update_min_vruntime(cfs_rq);

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static void update_curr_fair(struct rq *rq)
{
	update_curr(cfs_rq_of(&rq->curr->se));
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 wait_start, prev_wait_start;

	if (!schedstat_enabled())
		return;

	wait_start = rq_clock(rq_of(cfs_rq));
	prev_wait_start = schedstat_val(se->statistics.wait_start);

	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
	    likely(wait_start > prev_wait_start))
		wait_start -= prev_wait_start;

	__schedstat_set(se->statistics.wait_start, wait_start);
}

static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct task_struct *p;
	u64 delta;

	if (!schedstat_enabled())
		return;

	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);

	if (entity_is_task(se)) {
		p = task_of(se);
		if (task_on_rq_migrating(p)) {
			/*
			 * Preserve migrating task's wait time so wait_start
			 * time stamp can be adjusted to accumulate wait time
			 * prior to migration.
			 */
			__schedstat_set(se->statistics.wait_start, delta);
			return;
		}
		trace_sched_stat_wait(p, delta);
	}

	__schedstat_set(se->statistics.wait_max,
			max(schedstat_val(se->statistics.wait_max), delta));
	__schedstat_inc(se->statistics.wait_count);
	__schedstat_add(se->statistics.wait_sum, delta);
	__schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct task_struct *tsk = NULL;
	u64 sleep_start, block_start;

	if (!schedstat_enabled())
		return;

	sleep_start = schedstat_val(se->statistics.sleep_start);
	block_start = schedstat_val(se->statistics.block_start);

	if (entity_is_task(se))
		tsk = task_of(se);

	if (sleep_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
			__schedstat_set(se->statistics.sleep_max, delta);

		__schedstat_set(se->statistics.sleep_start, 0);
		__schedstat_add(se->statistics.sum_sleep_runtime, delta);

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (block_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
			__schedstat_set(se->statistics.block_max, delta);

		__schedstat_set(se->statistics.block_start, 0);
		__schedstat_add(se->statistics.sum_sleep_runtime, delta);

		if (tsk) {
			if (tsk->in_iowait) {
				__schedstat_add(se->statistics.iowait_sum, delta);
				__schedstat_inc(se->statistics.iowait_count);
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
}

/*
 * Task is being enqueued - update stats:
 */
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
	if (!schedstat_enabled())
		return;

	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper(cfs_rq, se);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{

	if (!schedstat_enabled())
		return;

	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);

	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
		struct task_struct *tsk = task_of(se);

		if (tsk->state & TASK_INTERRUPTIBLE)
			__schedstat_set(se->statistics.sleep_start,
				      rq_clock(rq_of(cfs_rq)));
		if (tsk->state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(se->statistics.block_start,
				      rq_clock(rq_of(cfs_rq)));
	}
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_clock_task(rq_of(cfs_rq));
}

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * Approximate time to scan a full NUMA task in ms. The task scan period is
 * calculated based on the task's virtual memory size and
 * numa_balancing_scan_size.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 1000;
unsigned int sysctl_numa_balancing_scan_period_max = 60000;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

struct numa_group {
	atomic_t refcount;

	spinlock_t lock; /* nr_tasks, tasks */
	int nr_tasks;
	pid_t gid;
	int active_nodes;

	struct rcu_head rcu;
	unsigned long total_faults;
	unsigned long max_faults_cpu;
	/*
	 * Faults_cpu is used to decide whether memory should move
	 * towards the CPU. As a consequence, these stats are weighted
	 * more by CPU use than by memory faults.
	 */
	unsigned long *faults_cpu;
	unsigned long faults[0];
};

static inline unsigned long group_faults_priv(struct numa_group *ng);
static inline unsigned long group_faults_shared(struct numa_group *ng);

static unsigned int task_nr_scan_windows(struct task_struct *p)
{
	unsigned long rss = 0;
	unsigned long nr_scan_pages;

	/*
	 * Calculations are based on RSS as non-present and empty pages are
	 * skipped by the PTE scanner and NUMA hinting faults should be trapped
	 * based on resident pages.
	 */
	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
	rss = get_mm_rss(p->mm);
	if (!rss)
		rss = nr_scan_pages;

	rss = round_up(rss, nr_scan_pages);
	return rss / nr_scan_pages;
}

/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
#define MAX_SCAN_WINDOW 2560

static unsigned int task_scan_min(struct task_struct *p)
{
	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
	unsigned int scan, floor;
	unsigned int windows = 1;

	if (scan_size < MAX_SCAN_WINDOW)
		windows = MAX_SCAN_WINDOW / scan_size;
	floor = 1000 / windows;

	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
	return max_t(unsigned int, floor, scan);
}

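/*
 * Worked example with the defaults (256MB scan size, 1000ms minimum scan
 * period, 4K pages): a task with a 1GB RSS spans four scan windows, so
 * task_scan_min() returns max(1000ms / (2560/256), 1000ms / 4) =
 * max(100ms, 250ms) = 250ms between successive 256MB scans.
 */
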
static unsigned int task_scan_start(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long period = smin;

	/* Scale the maximum scan period with the amount of shared memory. */
	if (p->numa_group) {
		struct numa_group *ng = p->numa_group;
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);

		period *= atomic_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;
	}

	return max(smin, period);
}

static unsigned int task_scan_max(struct task_struct *p)
{
	unsigned long smin = task_scan_min(p);
	unsigned long smax;

	/* Watch for min being lower than max due to floor calculations */
	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);

	/* Scale the maximum scan period with the amount of shared memory. */
	if (p->numa_group) {
		struct numa_group *ng = p->numa_group;
		unsigned long shared = group_faults_shared(ng);
		unsigned long private = group_faults_priv(ng);
		unsigned long period = smax;

		period *= atomic_read(&ng->refcount);
		period *= shared + 1;
		period /= private + shared + 1;

		smax = max(smax, period);
	}

	return max(smin, smax);
}

void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
{
	int mm_users = 0;
	struct mm_struct *mm = p->mm;

	if (mm) {
		mm_users = atomic_read(&mm->mm_users);
		if (mm_users == 1) {
			mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
			mm->numa_scan_seq = 0;
		}
	}
	p->node_stamp = 0;
	p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
	p->numa_work.next = &p->numa_work;
	p->numa_faults = NULL;
	p->numa_group = NULL;
	p->last_task_numa_placement = 0;
	p->last_sum_exec_runtime = 0;

	/* New address space, reset the preferred nid */
	if (!(clone_flags & CLONE_VM)) {
		p->numa_preferred_nid = -1;
		return;
	}

	/*
	 * New thread, keep existing numa_preferred_nid which should be copied
	 * already by arch_dup_task_struct but stagger when scans start.
	 */
	if (mm) {
		unsigned int delay;

		delay = min_t(unsigned int, task_scan_max(current),
			current->numa_scan_period * mm_users * NSEC_PER_MSEC);
		delay += 2 * TICK_NSEC;
		p->node_stamp = delay;
	}
}

static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running += (p->numa_preferred_nid != -1);
	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
}

static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}

/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)

pid_t task_numa_group_id(struct task_struct *p)
{
	return p->numa_group ? p->numa_group->gid : 0;
}

/*
 * The averaged statistics, shared & private, memory & CPU,
 * occupy the first half of the array. The second half of the
 * array is for current counters, which are averaged into the
 * first set by task_numa_placement.
 */
static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}

static inline unsigned long task_faults(struct task_struct *p, int nid)
{
	if (!p->numa_faults)
		return 0;

	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults(struct task_struct *p, int nid)
{
	if (!p->numa_group)
		return 0;

	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}

static inline unsigned long group_faults_priv(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
	}

	return faults;
}

static inline unsigned long group_faults_shared(struct numa_group *ng)
{
	unsigned long faults = 0;
	int node;

	for_each_online_node(node) {
		faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
	}

	return faults;
}

/*
 * A node triggering more than 1/3 as many NUMA faults as the maximum is
 * considered part of a numa group's pseudo-interleaving set. Migrations
 * between these nodes are slowed down, to allow things to settle down.
 */
#define ACTIVE_NODE_FRACTION 3

static bool numa_is_active_node(int nid, struct numa_group *ng)
{
	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}

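/*
 * E.g. if the busiest node of a numa_group has recorded 900 CPU faults
 * (max_faults_cpu), any node with more than 300 CPU faults counts as
 * "active" and is treated as part of the group's pseudo-interleaving set.
 */
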
Rik van Riel6c6b1192014-10-17 03:29:52 -04001279/* Handle placement on systems where not all nodes are directly connected. */
1280static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1281 int maxdist, bool task)
1282{
1283 unsigned long score = 0;
1284 int node;
1285
1286 /*
1287 * All nodes are directly connected, and the same distance
1288 * from each other. No need for fancy placement algorithms.
1289 */
1290 if (sched_numa_topology_type == NUMA_DIRECT)
1291 return 0;
1292
1293 /*
1294 * This code is called for each node, introducing N^2 complexity,
1295 * which should be ok given the number of nodes rarely exceeds 8.
1296 */
1297 for_each_online_node(node) {
1298 unsigned long faults;
1299 int dist = node_distance(nid, node);
1300
1301 /*
1302 * The furthest away nodes in the system are not interesting
1303 * for placement; nid was already counted.
1304 */
1305 if (dist == sched_max_numa_distance || node == nid)
1306 continue;
1307
1308 /*
1309 * On systems with a backplane NUMA topology, compare groups
1310 * of nodes, and move tasks towards the group with the most
1311 * memory accesses. When comparing two nodes at distance
1312 * "hoplimit", only nodes closer by than "hoplimit" are part
1313 * of each group. Skip other nodes.
1314 */
1315 if (sched_numa_topology_type == NUMA_BACKPLANE &&
Srikar Dronamraju0ee7e742018-06-20 22:32:48 +05301316 dist >= maxdist)
Rik van Riel6c6b1192014-10-17 03:29:52 -04001317 continue;
1318
1319 /* Add up the faults from nearby nodes. */
1320 if (task)
1321 faults = task_faults(p, node);
1322 else
1323 faults = group_faults(p, node);
1324
1325 /*
1326 * On systems with a glueless mesh NUMA topology, there are
1327 * no fixed "groups of nodes". Instead, nodes that are not
1328 * directly connected bounce traffic through intermediate
1329 * nodes; a numa_group can occupy any set of nodes.
1330 * The further away a node is, the less the faults count.
1331 * This seems to result in good task placement.
1332 */
1333 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1334 faults *= (sched_max_numa_distance - dist);
1335 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1336 }
1337
1338 score += faults;
1339 }
1340
1341 return score;
1342}
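
/*
 * Illustration of the glueless-mesh scaling above: with LOCAL_DISTANCE
 * of 10 and, say, sched_max_numa_distance of 40, a node at distance 20
 * contributes (40 - 20) / (40 - 10) = 2/3 of its faults to the score,
 * a node at distance 30 contributes 1/3, and nodes at the maximum
 * distance were already skipped.
 */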
1343
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001344/*
1345 * These return the fraction of accesses done by a particular task, or
1346 * task group, on a particular numa node. The group weight is given a
1347 * larger multiplier, in order to group tasks together that are almost
1348 * evenly spread out between numa nodes.
1349 */
Rik van Riel7bd95322014-10-17 03:29:51 -04001350static inline unsigned long task_weight(struct task_struct *p, int nid,
1351 int dist)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001352{
Rik van Riel7bd95322014-10-17 03:29:51 -04001353 unsigned long faults, total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001354
Iulia Manda44dba3d2014-10-31 02:13:31 +02001355 if (!p->numa_faults)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001356 return 0;
1357
1358 total_faults = p->total_numa_faults;
1359
1360 if (!total_faults)
1361 return 0;
1362
Rik van Riel7bd95322014-10-17 03:29:51 -04001363 faults = task_faults(p, nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001364 faults += score_nearby_nodes(p, nid, dist, true);
1365
Rik van Riel7bd95322014-10-17 03:29:51 -04001366 return 1000 * faults / total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001367}
1368
Rik van Riel7bd95322014-10-17 03:29:51 -04001369static inline unsigned long group_weight(struct task_struct *p, int nid,
1370 int dist)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001371{
Rik van Riel7bd95322014-10-17 03:29:51 -04001372 unsigned long faults, total_faults;
1373
1374 if (!p->numa_group)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001375 return 0;
1376
Rik van Riel7bd95322014-10-17 03:29:51 -04001377 total_faults = p->numa_group->total_faults;
1378
1379 if (!total_faults)
1380 return 0;
1381
1382 faults = group_faults(p, nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001383 faults += score_nearby_nodes(p, nid, dist, false);
1384
Rik van Riel7bd95322014-10-17 03:29:51 -04001385 return 1000 * faults / total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001386}
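
/*
 * Both weights are scaled by 1000. For example, a task with
 * total_numa_faults == 2000, of which 600 were sampled on @nid and
 * with no nearby-node contribution, gets a task_weight() of 300;
 * group_weight() is the same calculation over the numa_group totals.
 */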
1387
Rik van Riel10f39042014-01-27 17:03:44 -05001388bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1389 int src_nid, int dst_cpu)
1390{
1391 struct numa_group *ng = p->numa_group;
1392 int dst_nid = cpu_to_node(dst_cpu);
1393 int last_cpupid, this_cpupid;
1394
1395 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1396
1397 /*
1398 * Multi-stage node selection is used in conjunction with a periodic
1399 * migration fault to build a temporal task<->page relation. By using
1400 * a two-stage filter we remove short/unlikely relations.
1401 *
1402 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1403 * a task's usage of a particular page (n_p) per total usage of this
1404 * page (n_t) (in a given time-span) to a probability.
1405 *
1406 * Our periodic faults will sample this probability and getting the
1407 * same result twice in a row, given these samples are fully
1408 * independent, is then given by P(n)^2, provided our sample period
1409 * is sufficiently short compared to the usage pattern.
1410 *
1411	 * This quadratic squishes small probabilities, making it less likely we
1412 * act on an unlikely task<->page relation.
1413 */
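	/*
	 * For instance, if this task accounts for ~20% of the accesses to
	 * the page, the chance of sampling two consecutive faults from it
	 * is only ~4%, so migrations towards nodes the task rarely uses
	 * are usually filtered out here (numbers are illustrative).
	 */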
1414 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1415 if (!cpupid_pid_unset(last_cpupid) &&
1416 cpupid_to_nid(last_cpupid) != dst_nid)
1417 return false;
1418
1419 /* Always allow migrate on private faults */
1420 if (cpupid_match_pid(p, last_cpupid))
1421 return true;
1422
1423 /* A shared fault, but p->numa_group has not been set up yet. */
1424 if (!ng)
1425 return true;
1426
1427 /*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001428 * Destination node is much more heavily used than the source
1429 * node? Allow migration.
Rik van Riel10f39042014-01-27 17:03:44 -05001430 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001431 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1432 ACTIVE_NODE_FRACTION)
Rik van Riel10f39042014-01-27 17:03:44 -05001433 return true;
1434
1435 /*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001436 * Distribute memory according to CPU & memory use on each node,
1437 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1438 *
1439 * faults_cpu(dst) 3 faults_cpu(src)
1440 * --------------- * - > ---------------
1441 * faults_mem(dst) 4 faults_mem(src)
Rik van Riel10f39042014-01-27 17:03:44 -05001442 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001443 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1444 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
Rik van Riel10f39042014-01-27 17:03:44 -05001445}
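
/*
 * Worked example of the hysteresis check above (illustrative numbers):
 * with faults_cpu(dst) = 900, faults_mem(dst) = 500, faults_cpu(src) = 400
 * and faults_mem(src) = 600, the comparison is 900 * 600 * 3 = 1620000
 * against 400 * 500 * 4 = 800000, so the page is migrated. With equal
 * CPU/memory ratios on both nodes the 3/4 factor leaves the page where
 * it is.
 */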
1446
Viresh Kumarc7132dd2017-05-24 10:59:54 +05301447static unsigned long weighted_cpuload(struct rq *rq);
Mel Gorman58d081b2013-10-07 11:29:10 +01001448static unsigned long source_load(int cpu, int type);
1449static unsigned long target_load(int cpu, int type);
Mel Gormane6628d52013-10-07 11:29:02 +01001450
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001451/* Cached statistics for all CPUs within a node */
Mel Gorman58d081b2013-10-07 11:29:10 +01001452struct numa_stats {
1453 unsigned long load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001454
1455 /* Total compute capacity of CPUs on a node */
Nicolas Pitre5ef20ca2014-05-26 18:19:34 -04001456 unsigned long compute_capacity;
Mel Gorman58d081b2013-10-07 11:29:10 +01001457};
Mel Gormane6628d52013-10-07 11:29:02 +01001458
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001459/*
1460 * XXX borrowed from update_sg_lb_stats
1461 */
1462static void update_numa_stats(struct numa_stats *ns, int nid)
1463{
Vincent Guittotd90707e2018-08-29 15:19:09 +02001464 int cpu;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001465
1466 memset(ns, 0, sizeof(*ns));
1467 for_each_cpu(cpu, cpumask_of_node(nid)) {
1468 struct rq *rq = cpu_rq(cpu);
1469
Viresh Kumarc7132dd2017-05-24 10:59:54 +05301470 ns->load += weighted_cpuload(rq);
Nicolas Pitreced549f2014-05-26 18:19:38 -04001471 ns->compute_capacity += capacity_of(cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001472 }
1473
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001474}
1475
Mel Gorman58d081b2013-10-07 11:29:10 +01001476struct task_numa_env {
1477 struct task_struct *p;
1478
1479 int src_cpu, src_nid;
1480 int dst_cpu, dst_nid;
1481
1482 struct numa_stats src_stats, dst_stats;
1483
Wanpeng Li40ea2b42013-12-05 19:10:17 +08001484 int imbalance_pct;
Rik van Riel7bd95322014-10-17 03:29:51 -04001485 int dist;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001486
1487 struct task_struct *best_task;
1488 long best_imp;
Mel Gorman58d081b2013-10-07 11:29:10 +01001489 int best_cpu;
1490};
1491
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001492static void task_numa_assign(struct task_numa_env *env,
1493 struct task_struct *p, long imp)
1494{
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301495 struct rq *rq = cpu_rq(env->dst_cpu);
1496
1497	/* Bail out if the run-queue is part of an active NUMA balance. */
1498 if (xchg(&rq->numa_migrate_on, 1))
1499 return;
1500
1501 /*
1502	 * Clear the previous best_cpu/rq numa-migrate flag, since the task has
1503	 * now found a better CPU to move to or swap with.
1504 */
1505 if (env->best_cpu != -1) {
1506 rq = cpu_rq(env->best_cpu);
1507 WRITE_ONCE(rq->numa_migrate_on, 0);
1508 }
1509
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001510 if (env->best_task)
1511 put_task_struct(env->best_task);
Oleg Nesterovbac78572016-05-18 21:57:33 +02001512 if (p)
1513 get_task_struct(p);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001514
1515 env->best_task = p;
1516 env->best_imp = imp;
1517 env->best_cpu = env->dst_cpu;
1518}
1519
Rik van Riel28a21742014-06-23 11:46:13 -04001520static bool load_too_imbalanced(long src_load, long dst_load,
Rik van Riele63da032014-05-14 13:22:21 -04001521 struct task_numa_env *env)
1522{
Rik van Riele4991b22015-05-27 15:04:27 -04001523 long imb, old_imb;
1524 long orig_src_load, orig_dst_load;
Rik van Riel28a21742014-06-23 11:46:13 -04001525 long src_capacity, dst_capacity;
1526
1527 /*
1528 * The load is corrected for the CPU capacity available on each node.
1529 *
1530 * src_load dst_load
1531 * ------------ vs ---------
1532 * src_capacity dst_capacity
1533 */
1534 src_capacity = env->src_stats.compute_capacity;
1535 dst_capacity = env->dst_stats.compute_capacity;
Rik van Riele63da032014-05-14 13:22:21 -04001536
Srikar Dronamraju5f95ba72018-06-20 22:32:44 +05301537 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
Rik van Riele63da032014-05-14 13:22:21 -04001538
Rik van Riel28a21742014-06-23 11:46:13 -04001539 orig_src_load = env->src_stats.load;
Rik van Riele4991b22015-05-27 15:04:27 -04001540 orig_dst_load = env->dst_stats.load;
Rik van Riel28a21742014-06-23 11:46:13 -04001541
Srikar Dronamraju5f95ba72018-06-20 22:32:44 +05301542 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
Rik van Riele4991b22015-05-27 15:04:27 -04001543
1544 /* Would this change make things worse? */
1545 return (imb > old_imb);
Rik van Riele63da032014-05-14 13:22:21 -04001546}
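
/*
 * Example with equal capacities of 1024 (illustrative numbers): with
 * orig_src_load = 1000 and orig_dst_load = 600 the existing imbalance
 * is 400 * 1024. Moving a task of load 200 gives 800 vs 800, i.e. no
 * imbalance, so the move is allowed; moving a task of load 600 gives
 * 400 vs 1200, a larger imbalance than before, so it is rejected.
 */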
1547
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001548/*
Srikar Dronamraju6fd98e72018-09-21 23:19:01 +05301549 * Maximum NUMA importance can be 1998 (2*999);
1550 * SMALLIMP @ 30 would be close to 1998/64.
1551 * Used to deter task migration.
1552 */
1553#define SMALLIMP 30
1554
1555/*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001556 * This checks if the overall compute and NUMA accesses of the system would
1557 * be improved if the source task was migrated to the target dst_cpu, taking
1558 * into account that it might be best if the task running on the dst_cpu is
1559 * exchanged with the source task.
1560 */
Rik van Riel887c2902013-10-07 11:29:31 +01001561static void task_numa_compare(struct task_numa_env *env,
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301562 long taskimp, long groupimp, bool maymove)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001563{
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001564 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1565 struct task_struct *cur;
Rik van Riel28a21742014-06-23 11:46:13 -04001566 long src_load, dst_load;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001567 long load;
Rik van Riel1c5d3eb2014-06-23 11:46:15 -04001568 long imp = env->p->numa_group ? groupimp : taskimp;
Rik van Riel0132c3e2014-06-23 11:46:16 -04001569 long moveimp = imp;
Rik van Riel7bd95322014-10-17 03:29:51 -04001570 int dist = env->dist;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001571
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301572 if (READ_ONCE(dst_rq->numa_migrate_on))
1573 return;
1574
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001575 rcu_read_lock();
Oleg Nesterovbac78572016-05-18 21:57:33 +02001576 cur = task_rcu_dereference(&dst_rq->curr);
1577 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001578 cur = NULL;
1579
1580 /*
Peter Zijlstra7af68332014-11-10 10:54:35 +01001581 * Because we have preemption enabled we can get migrated around and
1582	 * end up trying to select ourselves (current == env->p) as a swap candidate.
1583 */
1584 if (cur == env->p)
1585 goto unlock;
1586
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301587 if (!cur) {
Srikar Dronamraju6fd98e72018-09-21 23:19:01 +05301588 if (maymove && moveimp >= env->best_imp)
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301589 goto assign;
1590 else
1591 goto unlock;
1592 }
1593
Peter Zijlstra7af68332014-11-10 10:54:35 +01001594 /*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001595 * "imp" is the fault differential for the source task between the
1596 * source and destination node. Calculate the total differential for
1597 * the source task and potential destination task. The more negative
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301598	 * the value is, the more remote accesses would be expected to
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001599 * be incurred if the tasks were swapped.
1600 */
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301601	/* Skip this swap candidate if it cannot move to the source CPU. */
1602 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001603 goto unlock;
1604
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001605 /*
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301606 * If dst and source tasks are in the same NUMA group, or not
1607 * in any group then look only at task weights.
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001608 */
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301609 if (cur->numa_group == env->p->numa_group) {
1610 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1611 task_weight(cur, env->dst_nid, dist);
Rik van Riel0132c3e2014-06-23 11:46:16 -04001612 /*
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301613 * Add some hysteresis to prevent swapping the
1614 * tasks within a group over tiny differences.
Rik van Riel0132c3e2014-06-23 11:46:16 -04001615 */
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301616 if (cur->numa_group)
1617 imp -= imp / 16;
1618 } else {
1619 /*
1620 * Compare the group weights. If a task is all by itself
1621 * (not part of a group), use the task weight instead.
1622 */
1623 if (cur->numa_group && env->p->numa_group)
1624 imp += group_weight(cur, env->src_nid, dist) -
1625 group_weight(cur, env->dst_nid, dist);
1626 else
1627 imp += task_weight(cur, env->src_nid, dist) -
1628 task_weight(cur, env->dst_nid, dist);
Rik van Riel0132c3e2014-06-23 11:46:16 -04001629 }
1630
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301631 if (maymove && moveimp > imp && moveimp > env->best_imp) {
Srikar Dronamraju6fd98e72018-09-21 23:19:01 +05301632 imp = moveimp;
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301633 cur = NULL;
1634 goto assign;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001635 }
1636
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301637 /*
Srikar Dronamraju6fd98e72018-09-21 23:19:01 +05301638 * If the NUMA importance is less than SMALLIMP,
1639 * task migration might only result in ping pong
1640 * of tasks and also hurt performance due to cache
1641 * misses.
1642 */
1643 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1644 goto unlock;
1645
1646 /*
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301647 * In the overloaded case, try and keep the load balanced.
1648 */
1649 load = task_h_load(env->p) - task_h_load(cur);
1650 if (!load)
1651 goto assign;
1652
1653 dst_load = env->dst_stats.load + load;
1654 src_load = env->src_stats.load - load;
1655
Rik van Riel28a21742014-06-23 11:46:13 -04001656 if (load_too_imbalanced(src_load, dst_load, env))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001657 goto unlock;
1658
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301659assign:
Rik van Rielba7e5a22014-09-04 16:35:30 -04001660 /*
1661 * One idle CPU per node is evaluated for a task numa move.
1662 * Call select_idle_sibling to maybe find a better one.
1663 */
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001664 if (!cur) {
1665 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01001666		 * select_idle_sibling() uses a per-CPU cpumask that
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001667 * can be used from IRQ context.
1668 */
1669 local_irq_disable();
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01001670 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1671 env->dst_cpu);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001672 local_irq_enable();
1673 }
Rik van Rielba7e5a22014-09-04 16:35:30 -04001674
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001675 task_numa_assign(env, cur, imp);
1676unlock:
1677 rcu_read_unlock();
1678}
1679
Rik van Riel887c2902013-10-07 11:29:31 +01001680static void task_numa_find_cpu(struct task_numa_env *env,
1681 long taskimp, long groupimp)
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001682{
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301683 long src_load, dst_load, load;
1684 bool maymove = false;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001685 int cpu;
1686
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301687 load = task_h_load(env->p);
1688 dst_load = env->dst_stats.load + load;
1689 src_load = env->src_stats.load - load;
1690
1691 /*
1692	 * If simply moving env->p (without swapping tasks around) would not
1693	 * leave the load too imbalanced, allow task_numa_compare() to consider it.
1694 */
1695 maymove = !load_too_imbalanced(src_load, dst_load, env);
1696
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001697 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1698 /* Skip this CPU if the source task cannot migrate */
Ingo Molnar0c98d342017-02-05 15:38:10 +01001699 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001700 continue;
1701
1702 env->dst_cpu = cpu;
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301703 task_numa_compare(env, taskimp, groupimp, maymove);
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001704 }
1705}
1706
Mel Gorman58d081b2013-10-07 11:29:10 +01001707static int task_numa_migrate(struct task_struct *p)
Mel Gormane6628d52013-10-07 11:29:02 +01001708{
Mel Gorman58d081b2013-10-07 11:29:10 +01001709 struct task_numa_env env = {
1710 .p = p,
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001711
Mel Gorman58d081b2013-10-07 11:29:10 +01001712 .src_cpu = task_cpu(p),
Ingo Molnarb32e86b2013-10-07 11:29:30 +01001713 .src_nid = task_node(p),
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001714
1715 .imbalance_pct = 112,
1716
1717 .best_task = NULL,
1718 .best_imp = 0,
Rik van Riel4142c3e2016-01-25 17:07:39 -05001719 .best_cpu = -1,
Mel Gorman58d081b2013-10-07 11:29:10 +01001720 };
1721 struct sched_domain *sd;
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301722 struct rq *best_rq;
Rik van Riel887c2902013-10-07 11:29:31 +01001723 unsigned long taskweight, groupweight;
Rik van Riel7bd95322014-10-17 03:29:51 -04001724 int nid, ret, dist;
Rik van Riel887c2902013-10-07 11:29:31 +01001725 long taskimp, groupimp;
Mel Gormane6628d52013-10-07 11:29:02 +01001726
Mel Gorman58d081b2013-10-07 11:29:10 +01001727 /*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001728 * Pick the lowest SD_NUMA domain, as that would have the smallest
1729 * imbalance and would be the first to start moving tasks about.
1730 *
1731 * And we want to avoid any moving of tasks about, as that would create
1732 * random movement of tasks -- counter the numa conditions we're trying
1733 * to satisfy here.
Mel Gorman58d081b2013-10-07 11:29:10 +01001734 */
Mel Gormane6628d52013-10-07 11:29:02 +01001735 rcu_read_lock();
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001736 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
Rik van Riel46a73e82013-11-11 19:29:25 -05001737 if (sd)
1738 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
Mel Gormane6628d52013-10-07 11:29:02 +01001739 rcu_read_unlock();
1740
Rik van Riel46a73e82013-11-11 19:29:25 -05001741 /*
1742 * Cpusets can break the scheduler domain tree into smaller
1743 * balance domains, some of which do not cross NUMA boundaries.
1744 * Tasks that are "trapped" in such domains cannot be migrated
1745 * elsewhere, so there is no point in (re)trying.
1746 */
1747 if (unlikely(!sd)) {
Srikar Dronamraju8cd45ee2018-06-20 22:32:45 +05301748 sched_setnuma(p, task_node(p));
Rik van Riel46a73e82013-11-11 19:29:25 -05001749 return -EINVAL;
1750 }
1751
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001752 env.dst_nid = p->numa_preferred_nid;
Rik van Riel7bd95322014-10-17 03:29:51 -04001753 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1754 taskweight = task_weight(p, env.src_nid, dist);
1755 groupweight = group_weight(p, env.src_nid, dist);
1756 update_numa_stats(&env.src_stats, env.src_nid);
1757 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1758 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001759 update_numa_stats(&env.dst_stats, env.dst_nid);
Mel Gorman58d081b2013-10-07 11:29:10 +01001760
Rik van Riela43455a2014-06-04 16:09:42 -04001761 /* Try to find a spot on the preferred nid. */
Srikar Dronamraju2d4056f2018-06-20 22:32:53 +05301762 task_numa_find_cpu(&env, taskimp, groupimp);
Rik van Riele1dda8a2013-10-07 11:29:19 +01001763
Rik van Riel9de05d42014-10-09 17:27:47 -04001764 /*
1765 * Look at other nodes in these cases:
1766 * - there is no space available on the preferred_nid
1767 * - the task is part of a numa_group that is interleaved across
1768 * multiple NUMA nodes; in order to better consolidate the group,
1769 * we need to check other locations.
1770 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001771 if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001772 for_each_online_node(nid) {
1773 if (nid == env.src_nid || nid == p->numa_preferred_nid)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001774 continue;
1775
Rik van Riel7bd95322014-10-17 03:29:51 -04001776 dist = node_distance(env.src_nid, env.dst_nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001777 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1778 dist != env.dist) {
1779 taskweight = task_weight(p, env.src_nid, dist);
1780 groupweight = group_weight(p, env.src_nid, dist);
1781 }
Rik van Riel7bd95322014-10-17 03:29:51 -04001782
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001783 /* Only consider nodes where both task and groups benefit */
Rik van Riel7bd95322014-10-17 03:29:51 -04001784 taskimp = task_weight(p, nid, dist) - taskweight;
1785 groupimp = group_weight(p, nid, dist) - groupweight;
Rik van Riel887c2902013-10-07 11:29:31 +01001786 if (taskimp < 0 && groupimp < 0)
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001787 continue;
1788
Rik van Riel7bd95322014-10-17 03:29:51 -04001789 env.dist = dist;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001790 env.dst_nid = nid;
1791 update_numa_stats(&env.dst_stats, env.dst_nid);
Srikar Dronamraju2d4056f2018-06-20 22:32:53 +05301792 task_numa_find_cpu(&env, taskimp, groupimp);
Mel Gorman58d081b2013-10-07 11:29:10 +01001793 }
1794 }
1795
Rik van Riel68d1b022014-04-11 13:00:29 -04001796 /*
1797 * If the task is part of a workload that spans multiple NUMA nodes,
1798 * and is migrating into one of the workload's active nodes, remember
1799 * this node as the task's preferred numa node, so the workload can
1800 * settle down.
1801 * A task that migrated to a second choice node will be better off
1802 * trying for a better one later. Do not set the preferred node here.
1803 */
Rik van Rieldb015da2014-06-23 11:41:34 -04001804 if (p->numa_group) {
1805 if (env.best_cpu == -1)
1806 nid = env.src_nid;
1807 else
Srikar Dronamraju8cd45ee2018-06-20 22:32:45 +05301808 nid = cpu_to_node(env.best_cpu);
Rik van Rieldb015da2014-06-23 11:41:34 -04001809
Srikar Dronamraju8cd45ee2018-06-20 22:32:45 +05301810 if (nid != p->numa_preferred_nid)
1811 sched_setnuma(p, nid);
Rik van Rieldb015da2014-06-23 11:41:34 -04001812 }
1813
1814 /* No better CPU than the current one was found. */
1815 if (env.best_cpu == -1)
1816 return -EAGAIN;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01001817
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301818 best_rq = cpu_rq(env.best_cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001819 if (env.best_task == NULL) {
Mel Gorman286549d2014-01-21 15:51:03 -08001820 ret = migrate_task_to(p, env.best_cpu);
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301821 WRITE_ONCE(best_rq->numa_migrate_on, 0);
Mel Gorman286549d2014-01-21 15:51:03 -08001822 if (ret != 0)
1823 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001824 return ret;
1825 }
1826
Srikar Dronamraju0ad4e3d2018-06-20 22:32:50 +05301827 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301828 WRITE_ONCE(best_rq->numa_migrate_on, 0);
Srikar Dronamraju0ad4e3d2018-06-20 22:32:50 +05301829
Mel Gorman286549d2014-01-21 15:51:03 -08001830 if (ret != 0)
1831 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001832 put_task_struct(env.best_task);
1833 return ret;
Mel Gormane6628d52013-10-07 11:29:02 +01001834}
1835
Mel Gorman6b9a7462013-10-07 11:29:11 +01001836/* Attempt to migrate a task to a CPU on the preferred node. */
1837static void numa_migrate_preferred(struct task_struct *p)
1838{
Rik van Riel5085e2a2014-04-11 13:00:28 -04001839 unsigned long interval = HZ;
1840
Rik van Riel2739d3e2013-10-07 11:29:41 +01001841 /* This task has no NUMA fault statistics yet */
Iulia Manda44dba3d2014-10-31 02:13:31 +02001842 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
Rik van Riel2739d3e2013-10-07 11:29:41 +01001843 return;
1844
1845 /* Periodically retry migrating the task to the preferred node */
Rik van Riel5085e2a2014-04-11 13:00:28 -04001846 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
Mel Gorman789ba282018-05-09 17:31:15 +01001847 p->numa_migrate_retry = jiffies + interval;
Rik van Riel2739d3e2013-10-07 11:29:41 +01001848
Mel Gorman6b9a7462013-10-07 11:29:11 +01001849 /* Success if task is already running on preferred CPU */
Wanpeng Lide1b3012013-12-12 15:23:24 +08001850 if (task_node(p) == p->numa_preferred_nid)
Mel Gorman6b9a7462013-10-07 11:29:11 +01001851 return;
1852
Mel Gorman6b9a7462013-10-07 11:29:11 +01001853 /* Otherwise, try migrate to a CPU on the preferred node */
Rik van Riel2739d3e2013-10-07 11:29:41 +01001854 task_numa_migrate(p);
Mel Gorman6b9a7462013-10-07 11:29:11 +01001855}
1856
Rik van Riel04bb2f92013-10-07 11:29:36 +01001857/*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001858 * Find out how many nodes the workload is actively running on. Do this by
Rik van Riel20e07de2014-01-27 17:03:43 -05001859 * tracking the nodes from which NUMA hinting faults are triggered. This can
1860 * be different from the set of nodes where the workload's memory is currently
1861 * located.
Rik van Riel20e07de2014-01-27 17:03:43 -05001862 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001863static void numa_group_count_active_nodes(struct numa_group *numa_group)
Rik van Riel20e07de2014-01-27 17:03:43 -05001864{
1865 unsigned long faults, max_faults = 0;
Rik van Riel4142c3e2016-01-25 17:07:39 -05001866 int nid, active_nodes = 0;
Rik van Riel20e07de2014-01-27 17:03:43 -05001867
1868 for_each_online_node(nid) {
1869 faults = group_faults_cpu(numa_group, nid);
1870 if (faults > max_faults)
1871 max_faults = faults;
1872 }
1873
1874 for_each_online_node(nid) {
1875 faults = group_faults_cpu(numa_group, nid);
Rik van Riel4142c3e2016-01-25 17:07:39 -05001876 if (faults * ACTIVE_NODE_FRACTION > max_faults)
1877 active_nodes++;
Rik van Riel20e07de2014-01-27 17:03:43 -05001878 }
Rik van Riel4142c3e2016-01-25 17:07:39 -05001879
1880 numa_group->max_faults_cpu = max_faults;
1881 numa_group->active_nodes = active_nodes;
Rik van Riel20e07de2014-01-27 17:03:43 -05001882}
1883
1884/*
Rik van Riel04bb2f92013-10-07 11:29:36 +01001885 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1886 * increments. The more local the fault statistics are, the higher the scan
Rik van Riela22b4b02014-06-23 11:41:35 -04001887 * period will be for the next scan window. If the local/(local+remote) ratio is
1888 * below NUMA_PERIOD_THRESHOLD (the ratio is expressed in 1..NUMA_PERIOD_SLOTS),
1889 * the scan period will decrease. Aim for 70% local accesses.
Rik van Riel04bb2f92013-10-07 11:29:36 +01001890 */
1891#define NUMA_PERIOD_SLOTS 10
Rik van Riela22b4b02014-06-23 11:41:35 -04001892#define NUMA_PERIOD_THRESHOLD 7
Rik van Riel04bb2f92013-10-07 11:29:36 +01001893
1894/*
1895 * Increase the scan period (slow down scanning) if the majority of
1896 * our memory is already on our local node, or if the majority of
1897 * the page accesses are shared with other processes.
1898 * Otherwise, decrease the scan period.
1899 */
1900static void update_task_scan_period(struct task_struct *p,
1901 unsigned long shared, unsigned long private)
1902{
1903 unsigned int period_slot;
Rik van Riel37ec97de2017-07-31 15:28:46 -04001904 int lr_ratio, ps_ratio;
Rik van Riel04bb2f92013-10-07 11:29:36 +01001905 int diff;
1906
1907 unsigned long remote = p->numa_faults_locality[0];
1908 unsigned long local = p->numa_faults_locality[1];
1909
1910 /*
1911	 * If there were no recorded hinting faults then either the task is
1912	 * completely idle or all activity is in areas that are not of interest
Mel Gorman074c2382015-03-25 15:55:42 -07001913	 * to automatic numa balancing. Related to that, if there were failed
1914	 * migrations then it implies we are migrating too quickly or the local
1915	 * node is overloaded. In either case, scan more slowly.
Rik van Riel04bb2f92013-10-07 11:29:36 +01001916 */
Mel Gorman074c2382015-03-25 15:55:42 -07001917 if (local + shared == 0 || p->numa_faults_locality[2]) {
Rik van Riel04bb2f92013-10-07 11:29:36 +01001918 p->numa_scan_period = min(p->numa_scan_period_max,
1919 p->numa_scan_period << 1);
1920
1921 p->mm->numa_next_scan = jiffies +
1922 msecs_to_jiffies(p->numa_scan_period);
1923
1924 return;
1925 }
1926
1927 /*
1928 * Prepare to scale scan period relative to the current period.
1929 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1930 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1931 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1932 */
1933 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
Rik van Riel37ec97de2017-07-31 15:28:46 -04001934 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1935 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
1936
1937 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
1938 /*
1939 * Most memory accesses are local. There is no need to
1940 * do fast NUMA scanning, since memory is already local.
1941 */
1942 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
1943 if (!slot)
1944 slot = 1;
1945 diff = slot * period_slot;
1946 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
1947 /*
1948 * Most memory accesses are shared with other tasks.
1949 * There is no point in continuing fast NUMA scanning,
1950 * since other tasks may just move the memory elsewhere.
1951 */
1952 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
Rik van Riel04bb2f92013-10-07 11:29:36 +01001953 if (!slot)
1954 slot = 1;
1955 diff = slot * period_slot;
1956 } else {
Rik van Riel04bb2f92013-10-07 11:29:36 +01001957 /*
Rik van Riel37ec97de2017-07-31 15:28:46 -04001958 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
1959 * yet they are not on the local NUMA node. Speed up
1960 * NUMA scanning to get the memory moved over.
Rik van Riel04bb2f92013-10-07 11:29:36 +01001961 */
Rik van Riel37ec97de2017-07-31 15:28:46 -04001962 int ratio = max(lr_ratio, ps_ratio);
1963 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
Rik van Riel04bb2f92013-10-07 11:29:36 +01001964 }
1965
1966 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1967 task_scan_min(p), task_scan_max(p));
1968 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1969}
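
/*
 * Worked example of the adjustment above: with p->numa_scan_period of
 * 1000, period_slot is 100. If 90% of the faults are private
 * (ps_ratio = 9), the period grows by (9 - 7) * 100 = 200 and scanning
 * slows down. If both ratios are low, say lr_ratio = 3 and ps_ratio = 4,
 * the period shrinks by (7 - 4) * 100 = 300 and scanning speeds up.
 * The result is always clamped to [task_scan_min(), task_scan_max()].
 */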
1970
Rik van Riel7e2703e2014-01-27 17:03:45 -05001971/*
1972 * Get the fraction of time the task has been running since the last
1973 * NUMA placement cycle. The scheduler keeps similar statistics, but
1974 * decays those on a 32ms period, which is orders of magnitude off
1975 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1976 * stats only if the task is so new there are no NUMA statistics yet.
1977 */
1978static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1979{
1980 u64 runtime, delta, now;
1981 /* Use the start of this time slice to avoid calculations. */
1982 now = p->se.exec_start;
1983 runtime = p->se.sum_exec_runtime;
1984
1985 if (p->last_task_numa_placement) {
1986 delta = runtime - p->last_sum_exec_runtime;
1987 *period = now - p->last_task_numa_placement;
1988 } else {
Peter Zijlstrac7b50212017-05-06 16:42:08 +02001989 delta = p->se.avg.load_sum;
Yuyang Du9d89c252015-07-15 08:04:37 +08001990 *period = LOAD_AVG_MAX;
Rik van Riel7e2703e2014-01-27 17:03:45 -05001991 }
1992
1993 p->last_sum_exec_runtime = runtime;
1994 p->last_task_numa_placement = now;
1995
1996 return delta;
1997}
1998
Rik van Riel54009412014-10-17 03:29:53 -04001999/*
2000 * Determine the preferred nid for a task in a numa_group. This needs to
2001 * be done in a way that produces consistent results with group_weight,
2002 * otherwise workloads might not converge.
2003 */
2004static int preferred_group_nid(struct task_struct *p, int nid)
2005{
2006 nodemask_t nodes;
2007 int dist;
2008
2009 /* Direct connections between all NUMA nodes. */
2010 if (sched_numa_topology_type == NUMA_DIRECT)
2011 return nid;
2012
2013 /*
2014 * On a system with glueless mesh NUMA topology, group_weight
2015 * scores nodes according to the number of NUMA hinting faults on
2016 * both the node itself, and on nearby nodes.
2017 */
2018 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2019 unsigned long score, max_score = 0;
2020 int node, max_node = nid;
2021
2022 dist = sched_max_numa_distance;
2023
2024 for_each_online_node(node) {
2025 score = group_weight(p, node, dist);
2026 if (score > max_score) {
2027 max_score = score;
2028 max_node = node;
2029 }
2030 }
2031 return max_node;
2032 }
2033
2034 /*
2035 * Finding the preferred nid in a system with NUMA backplane
2036 * interconnect topology is more involved. The goal is to locate
2037 * tasks from numa_groups near each other in the system, and
2038 * untangle workloads from different sides of the system. This requires
2039 * searching down the hierarchy of node groups, recursively searching
2040 * inside the highest scoring group of nodes. The nodemask tricks
2041 * keep the complexity of the search down.
2042 */
2043 nodes = node_online_map;
2044 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2045 unsigned long max_faults = 0;
Jan Beulich81907472015-01-23 08:25:38 +00002046 nodemask_t max_group = NODE_MASK_NONE;
Rik van Riel54009412014-10-17 03:29:53 -04002047 int a, b;
2048
2049 /* Are there nodes at this distance from each other? */
2050 if (!find_numa_distance(dist))
2051 continue;
2052
2053 for_each_node_mask(a, nodes) {
2054 unsigned long faults = 0;
2055 nodemask_t this_group;
2056 nodes_clear(this_group);
2057
2058 /* Sum group's NUMA faults; includes a==b case. */
2059 for_each_node_mask(b, nodes) {
2060 if (node_distance(a, b) < dist) {
2061 faults += group_faults(p, b);
2062 node_set(b, this_group);
2063 node_clear(b, nodes);
2064 }
2065 }
2066
2067 /* Remember the top group. */
2068 if (faults > max_faults) {
2069 max_faults = faults;
2070 max_group = this_group;
2071 /*
2072 * subtle: at the smallest distance there is
2073 * just one node left in each "group", the
2074 * winner is the preferred nid.
2075 */
2076 nid = a;
2077 }
2078 }
2079 /* Next round, evaluate the nodes within max_group. */
Jan Beulich890a5402015-02-09 12:30:00 +01002080 if (!max_faults)
2081 break;
Rik van Riel54009412014-10-17 03:29:53 -04002082 nodes = max_group;
2083 }
2084 return nid;
2085}
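
/*
 * Sketch of the search above on a hypothetical backplane machine with
 * four nodes, where nodes {0,1} and {2,3} share a board (distance 20)
 * and the boards are distance 40 apart: the first pass (dist == 40)
 * groups the nodes per board and keeps the board with the most group
 * faults; the next pass (dist == 20) sees each remaining node on its
 * own, so the single node with the most faults becomes the preferred
 * nid.
 */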
2086
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002087static void task_numa_placement(struct task_struct *p)
2088{
Srikar Dronamrajuf03bb672018-06-20 22:32:46 +05302089 int seq, nid, max_nid = -1;
2090 unsigned long max_faults = 0;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002091 unsigned long fault_types[2] = { 0, 0 };
Rik van Riel7e2703e2014-01-27 17:03:45 -05002092 unsigned long total_faults;
2093 u64 runtime, period;
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002094 spinlock_t *group_lock = NULL;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002095
Jason Low7e5a2c12015-04-30 17:28:14 -07002096 /*
2097 * The p->mm->numa_scan_seq field gets updated without
2098 * exclusive access. Use READ_ONCE() here to ensure
2099 * that the field is read in a single access:
2100 */
Jason Low316c1608d2015-04-28 13:00:20 -07002101 seq = READ_ONCE(p->mm->numa_scan_seq);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002102 if (p->numa_scan_seq == seq)
2103 return;
2104 p->numa_scan_seq = seq;
Mel Gorman598f0ec2013-10-07 11:28:55 +01002105 p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002106
Rik van Riel7e2703e2014-01-27 17:03:45 -05002107 total_faults = p->numa_faults_locality[0] +
2108 p->numa_faults_locality[1];
2109 runtime = numa_get_avg_runtime(p, &period);
2110
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002111 /* If the task is part of a group prevent parallel updates to group stats */
2112 if (p->numa_group) {
2113 group_lock = &p->numa_group->lock;
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002114 spin_lock_irq(group_lock);
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002115 }
2116
Mel Gorman688b7582013-10-07 11:28:58 +01002117 /* Find the node with the highest number of faults */
2118 for_each_online_node(nid) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002119 /* Keep track of the offsets in numa_faults array */
2120 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002121 unsigned long faults = 0, group_faults = 0;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002122 int priv;
Mel Gorman745d6142013-10-07 11:28:59 +01002123
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002124 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
Rik van Riel7e2703e2014-01-27 17:03:45 -05002125 long diff, f_diff, f_weight;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002126
Iulia Manda44dba3d2014-10-31 02:13:31 +02002127 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2128 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2129 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2130 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
Mel Gorman745d6142013-10-07 11:28:59 +01002131
Mel Gormanac8e8952013-10-07 11:29:03 +01002132 /* Decay existing window, copy faults since last scan */
Iulia Manda44dba3d2014-10-31 02:13:31 +02002133 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2134 fault_types[priv] += p->numa_faults[membuf_idx];
2135 p->numa_faults[membuf_idx] = 0;
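			/*
			 * The decay is an exponential moving average with a
			 * half-life of one scan window: e.g. an old count of
			 * 400 combined with 100 newly buffered faults becomes
			 * 400/2 + 100 = 300 once the diff is applied below.
			 */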
Mel Gormanfb13c7e2013-10-07 11:29:17 +01002136
Rik van Riel7e2703e2014-01-27 17:03:45 -05002137 /*
2138 * Normalize the faults_from, so all tasks in a group
2139 * count according to CPU use, instead of by the raw
2140 * number of faults. Tasks with little runtime have
2141			 * little overall impact on throughput, and thus their
2142 * faults are less important.
2143 */
2144 f_weight = div64_u64(runtime << 16, period + 1);
Iulia Manda44dba3d2014-10-31 02:13:31 +02002145 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
Rik van Riel7e2703e2014-01-27 17:03:45 -05002146 (total_faults + 1);
Iulia Manda44dba3d2014-10-31 02:13:31 +02002147 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2148 p->numa_faults[cpubuf_idx] = 0;
Rik van Riel50ec8a42014-01-27 17:03:42 -05002149
Iulia Manda44dba3d2014-10-31 02:13:31 +02002150 p->numa_faults[mem_idx] += diff;
2151 p->numa_faults[cpu_idx] += f_diff;
2152 faults += p->numa_faults[mem_idx];
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002153 p->total_numa_faults += diff;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002154 if (p->numa_group) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002155 /*
2156 * safe because we can only change our own group
2157 *
2158 * mem_idx represents the offset for a given
2159 * nid and priv in a specific region because it
2160 * is at the beginning of the numa_faults array.
2161 */
2162 p->numa_group->faults[mem_idx] += diff;
2163 p->numa_group->faults_cpu[mem_idx] += f_diff;
Mel Gorman989348b2013-10-07 11:29:40 +01002164 p->numa_group->total_faults += diff;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002165 group_faults += p->numa_group->faults[mem_idx];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002166 }
Mel Gormanac8e8952013-10-07 11:29:03 +01002167 }
2168
Srikar Dronamrajuf03bb672018-06-20 22:32:46 +05302169 if (!p->numa_group) {
2170 if (faults > max_faults) {
2171 max_faults = faults;
2172 max_nid = nid;
2173 }
2174 } else if (group_faults > max_faults) {
2175 max_faults = group_faults;
Mel Gorman688b7582013-10-07 11:28:58 +01002176 max_nid = nid;
2177 }
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002178 }
2179
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002180 if (p->numa_group) {
Rik van Riel4142c3e2016-01-25 17:07:39 -05002181 numa_group_count_active_nodes(p->numa_group);
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002182 spin_unlock_irq(group_lock);
Srikar Dronamrajuf03bb672018-06-20 22:32:46 +05302183 max_nid = preferred_group_nid(p, max_nid);
Mel Gorman688b7582013-10-07 11:28:58 +01002184 }
2185
Rik van Rielbb97fc32014-06-04 16:33:15 -04002186 if (max_faults) {
2187 /* Set the new preferred node */
2188 if (max_nid != p->numa_preferred_nid)
2189 sched_setnuma(p, max_nid);
Mel Gorman3a7053b2013-10-07 11:29:00 +01002190 }
Srikar Dronamraju30619c82018-06-20 22:32:55 +05302191
2192 update_task_scan_period(p, fault_types[0], fault_types[1]);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002193}
2194
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002195static inline int get_numa_group(struct numa_group *grp)
2196{
2197 return atomic_inc_not_zero(&grp->refcount);
2198}
2199
2200static inline void put_numa_group(struct numa_group *grp)
2201{
2202 if (atomic_dec_and_test(&grp->refcount))
2203 kfree_rcu(grp, rcu);
2204}
2205
Mel Gorman3e6a9412013-10-07 11:29:35 +01002206static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2207 int *priv)
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002208{
2209 struct numa_group *grp, *my_grp;
2210 struct task_struct *tsk;
2211 bool join = false;
2212 int cpu = cpupid_to_cpu(cpupid);
2213 int i;
2214
2215 if (unlikely(!p->numa_group)) {
2216 unsigned int size = sizeof(struct numa_group) +
Rik van Riel50ec8a42014-01-27 17:03:42 -05002217 4*nr_node_ids*sizeof(unsigned long);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002218
2219 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2220 if (!grp)
2221 return;
2222
2223 atomic_set(&grp->refcount, 1);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002224 grp->active_nodes = 1;
2225 grp->max_faults_cpu = 0;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002226 spin_lock_init(&grp->lock);
Mel Gormane29cf082013-10-07 11:29:22 +01002227 grp->gid = p->pid;
Rik van Riel50ec8a42014-01-27 17:03:42 -05002228 /* Second half of the array tracks nids where faults happen */
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002229 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2230 nr_node_ids;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002231
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002232 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda44dba3d2014-10-31 02:13:31 +02002233 grp->faults[i] = p->numa_faults[i];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002234
Mel Gorman989348b2013-10-07 11:29:40 +01002235 grp->total_faults = p->total_numa_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002236
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002237 grp->nr_tasks++;
2238 rcu_assign_pointer(p->numa_group, grp);
2239 }
2240
2241 rcu_read_lock();
Jason Low316c1608d2015-04-28 13:00:20 -07002242 tsk = READ_ONCE(cpu_rq(cpu)->curr);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002243
2244 if (!cpupid_match_pid(tsk, cpupid))
Peter Zijlstra33547812013-10-09 10:24:48 +02002245 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002246
2247 grp = rcu_dereference(tsk->numa_group);
2248 if (!grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002249 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002250
2251 my_grp = p->numa_group;
2252 if (grp == my_grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002253 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002254
2255 /*
2256	 * Only join the other group if it's bigger; if we're the bigger group,
2257 * the other task will join us.
2258 */
2259 if (my_grp->nr_tasks > grp->nr_tasks)
Peter Zijlstra33547812013-10-09 10:24:48 +02002260 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002261
2262 /*
2263 * Tie-break on the grp address.
2264 */
2265 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002266 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002267
Rik van Rieldabe1d92013-10-07 11:29:34 +01002268 /* Always join threads in the same process. */
2269 if (tsk->mm == current->mm)
2270 join = true;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002271
Rik van Rieldabe1d92013-10-07 11:29:34 +01002272 /* Simple filter to avoid false positives due to PID collisions */
2273 if (flags & TNF_SHARED)
2274 join = true;
2275
Mel Gorman3e6a9412013-10-07 11:29:35 +01002276 /* Update priv based on whether false sharing was detected */
2277 *priv = !join;
2278
Rik van Rieldabe1d92013-10-07 11:29:34 +01002279 if (join && !get_numa_group(grp))
Peter Zijlstra33547812013-10-09 10:24:48 +02002280 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002281
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002282 rcu_read_unlock();
2283
2284 if (!join)
2285 return;
2286
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002287 BUG_ON(irqs_disabled());
2288 double_lock_irq(&my_grp->lock, &grp->lock);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002289
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002290 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002291 my_grp->faults[i] -= p->numa_faults[i];
2292 grp->faults[i] += p->numa_faults[i];
Mel Gorman989348b2013-10-07 11:29:40 +01002293 }
2294 my_grp->total_faults -= p->total_numa_faults;
2295 grp->total_faults += p->total_numa_faults;
2296
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002297 my_grp->nr_tasks--;
2298 grp->nr_tasks++;
2299
2300 spin_unlock(&my_grp->lock);
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002301 spin_unlock_irq(&grp->lock);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002302
2303 rcu_assign_pointer(p->numa_group, grp);
2304
2305 put_numa_group(my_grp);
Peter Zijlstra33547812013-10-09 10:24:48 +02002306 return;
2307
2308no_join:
2309 rcu_read_unlock();
2310 return;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002311}
2312
2313void task_numa_free(struct task_struct *p)
2314{
2315 struct numa_group *grp = p->numa_group;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002316 void *numa_faults = p->numa_faults;
Steven Rostedte9dd6852014-05-27 17:02:04 -04002317 unsigned long flags;
2318 int i;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002319
2320 if (grp) {
Steven Rostedte9dd6852014-05-27 17:02:04 -04002321 spin_lock_irqsave(&grp->lock, flags);
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002322 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda44dba3d2014-10-31 02:13:31 +02002323 grp->faults[i] -= p->numa_faults[i];
Mel Gorman989348b2013-10-07 11:29:40 +01002324 grp->total_faults -= p->total_numa_faults;
2325
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002326 grp->nr_tasks--;
Steven Rostedte9dd6852014-05-27 17:02:04 -04002327 spin_unlock_irqrestore(&grp->lock, flags);
Andreea-Cristina Bernat35b123e2014-08-22 17:50:43 +03002328 RCU_INIT_POINTER(p->numa_group, NULL);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002329 put_numa_group(grp);
2330 }
2331
Iulia Manda44dba3d2014-10-31 02:13:31 +02002332 p->numa_faults = NULL;
Rik van Riel82727012013-10-07 11:29:28 +01002333 kfree(numa_faults);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002334}
2335
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002336/*
2337 * Got a PROT_NONE fault for a page on @node.
2338 */
Rik van Riel58b46da2014-01-27 17:03:47 -05002339void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002340{
2341 struct task_struct *p = current;
Peter Zijlstra6688cc02013-10-07 11:29:24 +01002342 bool migrated = flags & TNF_MIGRATED;
Rik van Riel58b46da2014-01-27 17:03:47 -05002343 int cpu_node = task_node(current);
Rik van Riel792568e2014-04-11 13:00:27 -04002344 int local = !!(flags & TNF_FAULT_LOCAL);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002345 struct numa_group *ng;
Mel Gormanac8e8952013-10-07 11:29:03 +01002346 int priv;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002347
Srikar Dronamraju2a595722015-08-11 21:54:21 +05302348 if (!static_branch_likely(&sched_numa_balancing))
Mel Gorman1a687c22012-11-22 11:16:36 +00002349 return;
2350
Mel Gorman9ff1d9f2013-10-07 11:29:04 +01002351 /* for example, ksmd faulting in a user's mm */
2352 if (!p->mm)
2353 return;
2354
Mel Gormanf809ca92013-10-07 11:28:57 +01002355 /* Allocate buffer to track faults on a per-node basis */
Iulia Manda44dba3d2014-10-31 02:13:31 +02002356 if (unlikely(!p->numa_faults)) {
2357 int size = sizeof(*p->numa_faults) *
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002358 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
Mel Gormanf809ca92013-10-07 11:28:57 +01002359
Iulia Manda44dba3d2014-10-31 02:13:31 +02002360 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2361 if (!p->numa_faults)
Mel Gormanf809ca92013-10-07 11:28:57 +01002362 return;
Mel Gorman745d6142013-10-07 11:28:59 +01002363
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002364 p->total_numa_faults = 0;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002365 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
Mel Gormanf809ca92013-10-07 11:28:57 +01002366 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002367
Mel Gormanfb003b82012-11-15 09:01:14 +00002368 /*
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002369 * First accesses are treated as private, otherwise consider accesses
2370 * to be private if the accessing pid has not changed
2371 */
2372 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2373 priv = 1;
2374 } else {
2375 priv = cpupid_match_pid(p, last_cpupid);
Peter Zijlstra6688cc02013-10-07 11:29:24 +01002376 if (!priv && !(flags & TNF_NO_GROUP))
Mel Gorman3e6a9412013-10-07 11:29:35 +01002377 task_numa_group(p, last_cpupid, flags, &priv);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002378 }
2379
Rik van Riel792568e2014-04-11 13:00:27 -04002380 /*
2381 * If a workload spans multiple NUMA nodes, a shared fault that
2382 * occurs wholly within the set of nodes that the workload is
2383 * actively using should be counted as local. This allows the
2384 * scan rate to slow down when a workload has settled down.
2385 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05002386 ng = p->numa_group;
2387 if (!priv && !local && ng && ng->active_nodes > 1 &&
2388 numa_is_active_node(cpu_node, ng) &&
2389 numa_is_active_node(mem_node, ng))
Rik van Riel792568e2014-04-11 13:00:27 -04002390 local = 1;
2391
Rik van Riel2739d3e2013-10-07 11:29:41 +01002392 /*
2393	 * Retry migrating the task to its preferred node periodically, in case
2394	 * it previously failed, or the scheduler moved us.
2395 */
Srikar Dronamrajub6a60cf2018-06-20 22:33:00 +05302396 if (time_after(jiffies, p->numa_migrate_retry)) {
2397 task_numa_placement(p);
Mel Gorman6b9a7462013-10-07 11:29:11 +01002398 numa_migrate_preferred(p);
Srikar Dronamrajub6a60cf2018-06-20 22:33:00 +05302399 }
Mel Gorman6b9a7462013-10-07 11:29:11 +01002400
Ingo Molnarb32e86b2013-10-07 11:29:30 +01002401 if (migrated)
2402 p->numa_pages_migrated += pages;
Mel Gorman074c2382015-03-25 15:55:42 -07002403 if (flags & TNF_MIGRATE_FAIL)
2404 p->numa_faults_locality[2] += pages;
Ingo Molnarb32e86b2013-10-07 11:29:30 +01002405
Iulia Manda44dba3d2014-10-31 02:13:31 +02002406 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2407 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
Rik van Riel792568e2014-04-11 13:00:27 -04002408 p->numa_faults_locality[local] += pages;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002409}
2410
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002411static void reset_ptenuma_scan(struct task_struct *p)
2412{
Jason Low7e5a2c12015-04-30 17:28:14 -07002413 /*
2414 * We only did a read acquisition of the mmap sem, so
2415 * p->mm->numa_scan_seq is written to without exclusive access
2416 * and the update is not guaranteed to be atomic. That's not
2417 * much of an issue though, since this is just used for
2418 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2419 * expensive, to avoid any form of compiler optimizations:
2420 */
Jason Low316c1608d2015-04-28 13:00:20 -07002421 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002422 p->mm->numa_scan_offset = 0;
2423}
2424
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002425/*
2426 * The expensive part of numa migration is done from task_work context.
2427 * Triggered from task_tick_numa().
2428 */
2429void task_numa_work(struct callback_head *work)
2430{
2431 unsigned long migrate, next_scan, now = jiffies;
2432 struct task_struct *p = current;
2433 struct mm_struct *mm = p->mm;
Rik van Riel51170842015-11-05 15:56:23 -05002434 u64 runtime = p->se.sum_exec_runtime;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002435 struct vm_area_struct *vma;
Mel Gorman9f406042012-11-14 18:34:32 +00002436 unsigned long start, end;
Mel Gorman598f0ec2013-10-07 11:28:55 +01002437 unsigned long nr_pte_updates = 0;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002438 long pages, virtpages;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002439
Peter Zijlstra9148a3a2016-09-20 22:34:51 +02002440 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002441
2442 work->next = work; /* protect against double add */
2443 /*
2444 * Who cares about NUMA placement when they're dying.
2445 *
2446 * NOTE: make sure not to dereference p->mm before this check,
2447 * exit_task_work() happens _after_ exit_mm() so we could be called
2448 * without p->mm even though we still had it when we enqueued this
2449 * work.
2450 */
2451 if (p->flags & PF_EXITING)
2452 return;
2453
Mel Gorman930aa172013-10-07 11:29:37 +01002454 if (!mm->numa_next_scan) {
Mel Gorman7e8d16b2013-10-07 11:28:54 +01002455 mm->numa_next_scan = now +
2456 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
Mel Gormanb8593bf2012-11-21 01:18:23 +00002457 }
2458
2459 /*
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002460 * Enforce maximal scan/migration frequency..
2461 */
2462 migrate = mm->numa_next_scan;
2463 if (time_before(now, migrate))
2464 return;
2465
Mel Gorman598f0ec2013-10-07 11:28:55 +01002466 if (p->numa_scan_period == 0) {
2467 p->numa_scan_period_max = task_scan_max(p);
Rik van Rielb5dd77c2017-07-31 15:28:47 -04002468 p->numa_scan_period = task_scan_start(p);
Mel Gorman598f0ec2013-10-07 11:28:55 +01002469 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002470
Mel Gormanfb003b82012-11-15 09:01:14 +00002471 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002472 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2473 return;
2474
Mel Gormane14808b2012-11-19 10:59:15 +00002475 /*
Peter Zijlstra19a78d12013-10-07 11:28:51 +01002476 * Delay this task enough that another task of this mm will likely win
2477 * the next time around.
2478 */
2479 p->node_stamp += 2 * TICK_NSEC;
2480
Mel Gorman9f406042012-11-14 18:34:32 +00002481 start = mm->numa_scan_offset;
2482 pages = sysctl_numa_balancing_scan_size;
2483 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
Rik van Riel4620f8c2015-09-11 09:00:27 -04002484 virtpages = pages * 8; /* Scan up to this much virtual space */
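	/*
	 * With the typical default of 256MB and 4KB pages this is 65536
	 * pages per pass, and up to eight times that (~2GB) of virtual
	 * address space when much of it is unused or already marked.
	 */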
Mel Gorman9f406042012-11-14 18:34:32 +00002485 if (!pages)
2486 return;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002487
Rik van Riel4620f8c2015-09-11 09:00:27 -04002488
Vlastimil Babka8655d542017-05-15 15:13:16 +02002489 if (!down_read_trylock(&mm->mmap_sem))
2490 return;
Mel Gorman9f406042012-11-14 18:34:32 +00002491 vma = find_vma(mm, start);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002492 if (!vma) {
2493 reset_ptenuma_scan(p);
Mel Gorman9f406042012-11-14 18:34:32 +00002494 start = 0;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002495 vma = mm->mmap;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002496 }
Mel Gorman9f406042012-11-14 18:34:32 +00002497 for (; vma; vma = vma->vm_next) {
Naoya Horiguchi6b79c572015-04-07 14:26:47 -07002498 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
Mel Gorman8e76d4e2015-06-10 11:15:00 -07002499 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002500 continue;
Naoya Horiguchi6b79c572015-04-07 14:26:47 -07002501 }
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002502
Mel Gorman4591ce4f2013-10-07 11:29:13 +01002503 /*
2504 * Shared library pages mapped by multiple processes are not
2505 * migrated as it is expected they are cache replicated. Avoid
2506 * hinting faults in read-only file-backed mappings or the vdso
2507 * as migrating the pages will be of marginal benefit.
2508 */
2509 if (!vma->vm_mm ||
2510 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2511 continue;
2512
Mel Gorman3c67f472013-12-18 17:08:40 -08002513 /*
2514 * Skip inaccessible VMAs to avoid any confusion between
2515 * PROT_NONE and NUMA hinting ptes
2516 */
2517 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2518 continue;
2519
Mel Gorman9f406042012-11-14 18:34:32 +00002520 do {
2521 start = max(start, vma->vm_start);
2522 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2523 end = min(end, vma->vm_end);
Rik van Riel4620f8c2015-09-11 09:00:27 -04002524 nr_pte_updates = change_prot_numa(vma, start, end);
Mel Gorman598f0ec2013-10-07 11:28:55 +01002525
2526 /*
Rik van Riel4620f8c2015-09-11 09:00:27 -04002527 * Try to scan sysctl_numa_balancing_scan_size worth of
2528 * hpages that have at least one present PTE that
2529 * is not already pte-numa. If the VMA contains
2530 * areas that are unused or already full of prot_numa
2531 * PTEs, scan up to virtpages, to skip through those
2532 * areas faster.
Mel Gorman598f0ec2013-10-07 11:28:55 +01002533 */
2534 if (nr_pte_updates)
2535 pages -= (end - start) >> PAGE_SHIFT;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002536 virtpages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002537
Mel Gorman9f406042012-11-14 18:34:32 +00002538 start = end;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002539 if (pages <= 0 || virtpages <= 0)
Mel Gorman9f406042012-11-14 18:34:32 +00002540 goto out;
Rik van Riel3cf19622014-02-18 17:12:44 -05002541
2542 cond_resched();
Mel Gorman9f406042012-11-14 18:34:32 +00002543 } while (end != vma->vm_end);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002544 }
2545
Mel Gorman9f406042012-11-14 18:34:32 +00002546out:
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002547 /*
Peter Zijlstrac69307d2013-10-07 11:28:41 +01002548 * It is possible to reach the end of the VMA list but the last few
2549 * VMAs are not guaranteed to be vma_migratable. If they are not, we
2550 * would find the !migratable VMA on the next scan but not reset the
2551 * scanner to the start so check it now.
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002552 */
2553 if (vma)
Mel Gorman9f406042012-11-14 18:34:32 +00002554 mm->numa_scan_offset = start;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002555 else
2556 reset_ptenuma_scan(p);
2557 up_read(&mm->mmap_sem);
Rik van Riel51170842015-11-05 15:56:23 -05002558
2559 /*
2560 * Make sure tasks use at least 32x as much time to run other code
2561 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2562 * Usually update_task_scan_period slows down scanning enough; on an
2563 * overloaded system we need to limit overhead on a per-task basis.
2564 */
2565 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2566 u64 diff = p->se.sum_exec_runtime - runtime;
2567 p->node_stamp += 32 * diff;
2568 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002569}
2570
2571/*
2572 * Drive the periodic memory faults..
2573 */
2574void task_tick_numa(struct rq *rq, struct task_struct *curr)
2575{
2576 struct callback_head *work = &curr->numa_work;
2577 u64 period, now;
2578
2579 /*
2580 * We don't care about NUMA placement if we don't have memory.
2581 */
2582 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
2583 return;
2584
2585 /*
2586 * Using runtime rather than walltime has the dual advantage that
2587 * we (mostly) drive the selection from busy threads and that the
2588 * task needs to have done some actual work before we bother with
2589 * NUMA placement.
2590 */
2591 now = curr->se.sum_exec_runtime;
2592 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2593
Rik van Riel25b3e5a2015-11-05 15:56:22 -05002594 if (now > curr->node_stamp + period) {
Peter Zijlstra4b96a292012-10-25 14:16:47 +02002595 if (!curr->node_stamp)
Rik van Rielb5dd77c2017-07-31 15:28:47 -04002596 curr->numa_scan_period = task_scan_start(curr);
Peter Zijlstra19a78d12013-10-07 11:28:51 +01002597 curr->node_stamp += period;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002598
2599 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2600 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2601 task_work_add(curr, work, true);
2602 }
2603 }
2604}
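/*
 * Illustration of the check above (made-up period): with a
 * numa_scan_period of 1000ms the task must accumulate a full second of
 * CPU time beyond node_stamp before task_numa_work() is queued again,
 * so mostly-idle tasks are scanned only rarely.
 */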
Rik van Riel3fed3822017-06-23 12:55:29 -04002605
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05302606static void update_scan_period(struct task_struct *p, int new_cpu)
2607{
2608 int src_nid = cpu_to_node(task_cpu(p));
2609 int dst_nid = cpu_to_node(new_cpu);
2610
Mel Gorman05cbdf42018-09-21 23:18:59 +05302611 if (!static_branch_likely(&sched_numa_balancing))
2612 return;
2613
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05302614 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2615 return;
2616
Mel Gorman05cbdf42018-09-21 23:18:59 +05302617 if (src_nid == dst_nid)
2618 return;
2619
2620 /*
2621 * Allow resets if faults have been trapped before one scan
2622 * has completed. This is most likely due to a new task that
2623 * is pulled cross-node due to wakeups or load balancing.
2624 */
2625 if (p->numa_scan_seq) {
2626 /*
2627 * Avoid scan adjustments if moving to the preferred
2628 * node or if the task was not previously running on
2629 * the preferred node.
2630 */
2631 if (dst_nid == p->numa_preferred_nid ||
2632 (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
2633 return;
2634 }
2635
2636 p->numa_scan_period = task_scan_start(p);
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05302637}
2638
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002639#else
2640static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2641{
2642}
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002643
2644static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2645{
2646}
2647
2648static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2649{
2650}
Rik van Riel3fed3822017-06-23 12:55:29 -04002651
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05302652static inline void update_scan_period(struct task_struct *p, int new_cpu)
2653{
2654}
2655
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002656#endif /* CONFIG_NUMA_BALANCING */
2657
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002658static void
2659account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2660{
2661 update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstrac09595f2008-06-27 13:41:14 +02002662 if (!parent_entity(se))
Peter Zijlstra029632f2011-10-25 10:00:11 +02002663 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra367456c2012-02-20 21:49:09 +01002664#ifdef CONFIG_SMP
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002665 if (entity_is_task(se)) {
2666 struct rq *rq = rq_of(cfs_rq);
2667
2668 account_numa_enqueue(rq, task_of(se));
2669 list_add(&se->group_node, &rq->cfs_tasks);
2670 }
Peter Zijlstra367456c2012-02-20 21:49:09 +01002671#endif
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002672 cfs_rq->nr_running++;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002673}
2674
2675static void
2676account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2677{
2678 update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstrac09595f2008-06-27 13:41:14 +02002679 if (!parent_entity(se))
Peter Zijlstra029632f2011-10-25 10:00:11 +02002680 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Tim Chenbfdb1982016-02-01 14:47:59 -08002681#ifdef CONFIG_SMP
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002682 if (entity_is_task(se)) {
2683 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
Bharata B Raob87f1722008-09-25 09:53:54 +05302684 list_del_init(&se->group_node);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002685 }
Tim Chenbfdb1982016-02-01 14:47:59 -08002686#endif
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002687 cfs_rq->nr_running--;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002688}
2689
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02002690/*
2691 * Signed add and clamp on underflow.
2692 *
2693 * Explicitly do a load-store to ensure the intermediate value never hits
2694 * memory. This allows lockless observations without ever seeing the negative
2695 * values.
2696 */
2697#define add_positive(_ptr, _val) do { \
2698 typeof(_ptr) ptr = (_ptr); \
2699 typeof(_val) val = (_val); \
2700 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2701 \
2702 res = var + val; \
2703 \
2704 if (val < 0 && res > var) \
2705 res = 0; \
2706 \
2707 WRITE_ONCE(*ptr, res); \
2708} while (0)
2709
2710/*
2711 * Unsigned subtract and clamp on underflow.
2712 *
2713 * Explicitly do a load-store to ensure the intermediate value never hits
2714 * memory. This allows lockless observations without ever seeing the negative
2715 * values.
2716 */
2717#define sub_positive(_ptr, _val) do { \
2718 typeof(_ptr) ptr = (_ptr); \
2719 typeof(*ptr) val = (_val); \
2720 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2721 res = var - val; \
2722 if (res > var) \
2723 res = 0; \
2724 WRITE_ONCE(*ptr, res); \
2725} while (0)
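/*
 * Usage sketch for the helpers above (values purely illustrative): with
 * *ptr == 3, sub_positive(ptr, 5) would wrap a plain unsigned subtraction
 * around to a huge value, but the clamp stores 0 instead; likewise
 * add_positive(ptr, -5) on a signed *ptr of 3 clamps the result to 0
 * rather than letting it go negative.
 */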
2726
2727#ifdef CONFIG_SMP
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02002728static inline void
2729enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2730{
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002731 cfs_rq->runnable_weight += se->runnable_weight;
2732
2733 cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
2734 cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02002735}
2736
2737static inline void
2738dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2739{
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002740 cfs_rq->runnable_weight -= se->runnable_weight;
2741
2742 sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
2743 sub_positive(&cfs_rq->avg.runnable_load_sum,
2744 se_runnable(se) * se->avg.runnable_load_sum);
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02002745}
2746
2747static inline void
2748enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2749{
2750 cfs_rq->avg.load_avg += se->avg.load_avg;
2751 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
2752}
2753
2754static inline void
2755dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2756{
2757 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
2758 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
2759}
2760#else
2761static inline void
2762enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2763static inline void
2764dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2765static inline void
2766enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2767static inline void
2768dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
2769#endif
2770
Vincent Guittot90593932017-05-17 11:50:45 +02002771static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002772 unsigned long weight, unsigned long runnable)
Vincent Guittot90593932017-05-17 11:50:45 +02002773{
2774 if (se->on_rq) {
2775 /* commit outstanding execution time */
2776 if (cfs_rq->curr == se)
2777 update_curr(cfs_rq);
2778 account_entity_dequeue(cfs_rq, se);
2779 dequeue_runnable_load_avg(cfs_rq, se);
2780 }
2781 dequeue_load_avg(cfs_rq, se);
2782
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002783 se->runnable_weight = runnable;
Vincent Guittot90593932017-05-17 11:50:45 +02002784 update_load_set(&se->load, weight);
2785
2786#ifdef CONFIG_SMP
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002787 do {
2788 u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
2789
2790 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
2791 se->avg.runnable_load_avg =
2792 div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
2793 } while (0);
Vincent Guittot90593932017-05-17 11:50:45 +02002794#endif
2795
2796 enqueue_load_avg(cfs_rq, se);
2797 if (se->on_rq) {
2798 account_entity_enqueue(cfs_rq, se);
2799 enqueue_runnable_load_avg(cfs_rq, se);
2800 }
2801}
2802
2803void reweight_task(struct task_struct *p, int prio)
2804{
2805 struct sched_entity *se = &p->se;
2806 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2807 struct load_weight *load = &se->load;
2808 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
2809
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002810 reweight_entity(cfs_rq, se, weight, weight);
Vincent Guittot90593932017-05-17 11:50:45 +02002811 load->inv_weight = sched_prio_to_wmult[prio];
2812}
2813
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002814#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittot387f77c2018-02-13 09:59:42 +01002815#ifdef CONFIG_SMP
Peter Zijlstracef27402017-05-09 11:04:07 +02002816/*
2817 * All this does is approximate the hierarchical proportion which includes that
2818 * global sum we all love to hate.
2819 *
2820 * That is, the weight of a group entity, is the proportional share of the
2821 * group weight based on the group runqueue weights. That is:
2822 *
2823 * tg->weight * grq->load.weight
2824 * ge->load.weight = ----------------------------- (1)
2825 * \Sum grq->load.weight
2826 *
2827 * Now, because computing that sum is prohibitively expensive to compute (been
2828 * there, done that) we approximate it with this average stuff. The average
2829 * moves slower and therefore the approximation is cheaper and more stable.
2830 *
2831 * So instead of the above, we substitute:
2832 *
2833 * grq->load.weight -> grq->avg.load_avg (2)
2834 *
2835 * which yields the following:
2836 *
2837 * tg->weight * grq->avg.load_avg
2838 * ge->load.weight = ------------------------------ (3)
2839 * tg->load_avg
2840 *
2841 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
2842 *
2843 * That is shares_avg, and it is right (given the approximation (2)).
2844 *
2845 * The problem with it is that because the average is slow -- it was designed
2846 * to be exactly that of course -- this leads to transients in boundary
2847 * conditions. Specifically, the case where the group was idle and we start a
2848 * single task. It takes time for our CPU's grq->avg.load_avg to build up,
2849 * yielding bad latency etc..
2850 *
2851 * Now, in that special case (1) reduces to:
2852 *
2853 * tg->weight * grq->load.weight
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02002854 * ge->load.weight = ----------------------------- = tg->weight (4)
Peter Zijlstracef27402017-05-09 11:04:07 +02002855 * grq->load.weight
2856 *
2857 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
2858 *
2859 * So what we do is modify our approximation (3) to approach (4) in the (near)
2860 * UP case, like:
2861 *
2862 * ge->load.weight =
2863 *
2864 * tg->weight * grq->load.weight
2865 * --------------------------------------------------- (5)
2866 * tg->load_avg - grq->avg.load_avg + grq->load.weight
2867 *
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02002868 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
2869 * we need to use grq->avg.load_avg as its lower bound, which then gives:
2870 *
2871 *
2872 * tg->weight * grq->load.weight
2873 * ge->load.weight = ----------------------------- (6)
2874 * tg_load_avg'
2875 *
2876 * Where:
2877 *
2878 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
2879 * max(grq->load.weight, grq->avg.load_avg)
Peter Zijlstracef27402017-05-09 11:04:07 +02002880 *
2881 * And that is shares_weight and is icky. In the (near) UP case it approaches
2882 * (4) while in the normal case it approaches (3). It consistently
2883 * overestimates the ge->load.weight and therefore:
2884 *
2885 * \Sum ge->load.weight >= tg->weight
2886 *
2887 * hence icky!
2888 */
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04002889static long calc_group_shares(struct cfs_rq *cfs_rq)
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002890{
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02002891 long tg_weight, tg_shares, load, shares;
2892 struct task_group *tg = cfs_rq->tg;
2893
2894 tg_shares = READ_ONCE(tg->shares);
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002895
Peter Zijlstra3d4b60d2017-05-11 18:16:06 +02002896 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
Peter Zijlstraea1dc6f2016-06-24 16:11:02 +02002897
2898 tg_weight = atomic_long_read(&tg->load_avg);
2899
2900 /* Ensure tg_weight >= load */
2901 tg_weight -= cfs_rq->tg_load_avg_contrib;
2902 tg_weight += load;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002903
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02002904 shares = (tg_shares * load);
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02002905 if (tg_weight)
2906 shares /= tg_weight;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002907
Dietmar Eggemannb8fd8422017-01-11 11:29:47 +00002908 /*
2909 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
2910 * of a group with small tg->shares value. It is a floor value which is
2911 * assigned as a minimum load.weight to the sched_entity representing
2912 * the group on a CPU.
2913 *
2914 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
2915 * on an 8-core system with 8 tasks each runnable on one CPU shares has
2916 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
2917 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
2918 * instead of 0.
2919 */
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02002920 return clamp_t(long, shares, MIN_SHARES, tg_shares);
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002921}
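/*
 * Worked example for (6) with made-up numbers: tg->shares = 1024, two
 * CPUs, tg->load_avg = 2048 of which this CPU contributed 1024, and an
 * instantaneous (scaled down) grq->load.weight of 2048. Then
 * tg_load_avg' = 2048 - 1024 + max(2048, 1024) = 3072 and
 * shares = 1024 * 2048 / 3072 = 682, clamped to [MIN_SHARES, 1024].
 */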
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04002922
2923/*
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02002924 * This calculates the effective runnable weight for a group entity based on
2925 * the group entity weight calculated above.
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04002926 *
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02002927 * Because of the above approximation (2), our group entity weight is
2928 * a load_avg based ratio (3). This means that it includes blocked load and
2929 * does not represent the runnable weight.
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04002930 *
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02002931 * Approximate the group entity's runnable weight per ratio from the group
2932 * runqueue:
2933 *
2934 * grq->avg.runnable_load_avg
2935 * ge->runnable_weight = ge->load.weight * -------------------------- (7)
2936 * grq->avg.load_avg
2937 *
2938 * However, analogous to above, since the avg numbers are slow, this leads to
2939 * transients in the from-idle case. Instead we use:
2940 *
2941 * ge->runnable_weight = ge->load.weight *
2942 *
2943 * max(grq->avg.runnable_load_avg, grq->runnable_weight)
2944 * ----------------------------------------------------- (8)
2945 * max(grq->avg.load_avg, grq->load.weight)
2946 *
2947 * Where these max() serve both to use the 'instant' values to fix the slow
2948 * from-idle and avoid the /0 on to-idle, similar to (6).
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04002949 */
2950static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2951{
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02002952 long runnable, load_avg;
2953
2954 load_avg = max(cfs_rq->avg.load_avg,
2955 scale_load_down(cfs_rq->load.weight));
2956
2957 runnable = max(cfs_rq->avg.runnable_load_avg,
2958 scale_load_down(cfs_rq->runnable_weight));
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04002959
2960 runnable *= shares;
2961 if (load_avg)
2962 runnable /= load_avg;
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02002963
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04002964 return clamp_t(long, runnable, MIN_SHARES, shares);
2965}
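/*
 * Continuing the sketch above: when the runqueue is fully runnable
 * (runnable_load_avg close to load_avg), (8) simply returns the group
 * shares; if only about half of the queued load is runnable, the group
 * entity's runnable weight ends up at roughly half of its shares.
 */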
Vincent Guittot387f77c2018-02-13 09:59:42 +01002966#endif /* CONFIG_SMP */
Peter Zijlstraea1dc6f2016-06-24 16:11:02 +02002967
Paul Turner82958362012-10-04 13:18:31 +02002968static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2969
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002970/*
2971 * Recomputes the group entity based on the current state of its group
2972 * runqueue.
2973 */
2974static void update_cfs_group(struct sched_entity *se)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002975{
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002976 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
2977 long shares, runnable;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002978
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002979 if (!gcfs_rq)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002980 return;
Vincent Guittot89ee0482016-12-21 16:50:26 +01002981
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002982 if (throttled_hierarchy(gcfs_rq))
Vincent Guittot89ee0482016-12-21 16:50:26 +01002983 return;
2984
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002985#ifndef CONFIG_SMP
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002986 runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02002987
2988 if (likely(se->load.weight == shares))
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002989 return;
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02002990#else
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04002991 shares = calc_group_shares(gcfs_rq);
2992 runnable = calc_group_runnable(gcfs_rq, shares);
Yong Zhang3ff6dca2011-01-24 15:33:52 +08002993#endif
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002994
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002995 reweight_entity(cfs_rq_of(se), se, shares, runnable);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002996}
Vincent Guittot89ee0482016-12-21 16:50:26 +01002997
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002998#else /* CONFIG_FAIR_GROUP_SCHED */
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02002999static inline void update_cfs_group(struct sched_entity *se)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003000{
3001}
3002#endif /* CONFIG_FAIR_GROUP_SCHED */
3003
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003004static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
Viresh Kumara030d732017-05-24 10:59:52 +05303005{
Linus Torvalds43964402017-09-05 12:19:08 -07003006 struct rq *rq = rq_of(cfs_rq);
3007
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003008 if (&rq->cfs == cfs_rq || (flags & SCHED_CPUFREQ_MIGRATION)) {
Viresh Kumara030d732017-05-24 10:59:52 +05303009 /*
3010 * There are a few boundary cases this might miss but it should
3011 * get called often enough that that should (hopefully) not be
Joel Fernandes9783be22017-12-15 07:39:43 -08003012 * a real problem.
Viresh Kumara030d732017-05-24 10:59:52 +05303013 *
3014 * It will not get called when we go idle, because the idle
3015 * thread is a different class (!fair), nor will the utilization
3016 * number include things like RT tasks.
3017 *
3018 * As is, the util number is not freq-invariant (we'd have to
3019 * implement arch_scale_freq_capacity() for that).
3020 *
3021 * See cpu_util().
3022 */
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003023 cpufreq_update_util(rq, flags);
Viresh Kumara030d732017-05-24 10:59:52 +05303024 }
3025}
3026
Alex Shi141965c2013-06-26 13:05:39 +08003027#ifdef CONFIG_SMP
Paul Turnerc566e8e2012-10-04 13:18:30 +02003028#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02003029/**
3030 * update_tg_load_avg - update the tg's load avg
3031 * @cfs_rq: the cfs_rq whose avg changed
3032 * @force: update regardless of how small the difference
3033 *
3034 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3035 * However, because tg->load_avg is a global value there are performance
3036 * considerations.
3037 *
3038 * In order to avoid having to look at the other cfs_rq's, we use a
3039 * differential update where we store the last value we propagated. This in
3040 * turn allows skipping updates if the differential is 'small'.
3041 *
Rik van Riel815abf52017-06-23 12:55:30 -04003042 * Updating tg's load_avg is necessary before update_cfs_group().
Paul Turnerbb17f652012-10-04 13:18:31 +02003043 */
Yuyang Du9d89c252015-07-15 08:04:37 +08003044static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
Paul Turnerbb17f652012-10-04 13:18:31 +02003045{
Yuyang Du9d89c252015-07-15 08:04:37 +08003046 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
Paul Turnerbb17f652012-10-04 13:18:31 +02003047
Waiman Longaa0b7ae2015-12-02 13:41:50 -05003048 /*
3049 * No need to update load_avg for root_task_group as it is not used.
3050 */
3051 if (cfs_rq->tg == &root_task_group)
3052 return;
3053
Yuyang Du9d89c252015-07-15 08:04:37 +08003054 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
3055 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3056 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
Paul Turnerbb17f652012-10-04 13:18:31 +02003057 }
Paul Turner8165e142012-10-04 13:18:31 +02003058}
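/*
 * Example of the differential update above (illustrative numbers): with
 * tg_load_avg_contrib == 6400, the global tg->load_avg is only touched
 * once the local cfs_rq->avg.load_avg has drifted by more than
 * 6400 / 64 = 100, which keeps cross-CPU traffic on tg->load_avg low.
 */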
Dietmar Eggemannf5f97392014-02-26 11:19:33 +00003059
Byungchul Parkad936d82015-10-24 01:16:19 +09003060/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01003061 * Called within set_task_rq() right before setting a task's CPU. The
Byungchul Parkad936d82015-10-24 01:16:19 +09003062 * caller only guarantees p->pi_lock is held; no other assumptions,
3063 * including the state of rq->lock, should be made.
3064 */
3065void set_task_rq_fair(struct sched_entity *se,
3066 struct cfs_rq *prev, struct cfs_rq *next)
3067{
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003068 u64 p_last_update_time;
3069 u64 n_last_update_time;
3070
Byungchul Parkad936d82015-10-24 01:16:19 +09003071 if (!sched_feat(ATTACH_AGE_LOAD))
3072 return;
3073
3074 /*
3075 * We are supposed to update the task to "current" time, then it is up to
3076 * date and ready to go to its new CPU/cfs_rq. But we have difficulty in
3077 * getting what the current time is, so simply throw away the out-of-date
3078 * time. This will result in the wakee task being less decayed, but giving
3079 * the wakee a bit more load is an acceptable trade-off.
3080 */
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003081 if (!(se->avg.last_update_time && prev))
3082 return;
Byungchul Parkad936d82015-10-24 01:16:19 +09003083
3084#ifndef CONFIG_64BIT
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003085 {
Byungchul Parkad936d82015-10-24 01:16:19 +09003086 u64 p_last_update_time_copy;
3087 u64 n_last_update_time_copy;
3088
3089 do {
3090 p_last_update_time_copy = prev->load_last_update_time_copy;
3091 n_last_update_time_copy = next->load_last_update_time_copy;
3092
3093 smp_rmb();
3094
3095 p_last_update_time = prev->avg.last_update_time;
3096 n_last_update_time = next->avg.last_update_time;
3097
3098 } while (p_last_update_time != p_last_update_time_copy ||
3099 n_last_update_time != n_last_update_time_copy);
Byungchul Parkad936d82015-10-24 01:16:19 +09003100 }
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003101#else
3102 p_last_update_time = prev->avg.last_update_time;
3103 n_last_update_time = next->avg.last_update_time;
3104#endif
3105 __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
3106 se->avg.last_update_time = n_last_update_time;
Byungchul Parkad936d82015-10-24 01:16:19 +09003107}
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003108
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003109
3110/*
3111 * When a sched_entity joins/leaves the PELT hierarchy on migration, we need to
3112 * propagate its contribution. The key to this propagation is the invariant
3113 * that for each group:
3114 *
3115 * ge->avg == grq->avg (1)
3116 *
3117 * _IFF_ we look at the pure running and runnable sums. Because they
3118 * represent the very same entity, just at different points in the hierarchy.
3119 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003120 * Per the above update_tg_cfs_util() is trivial and simply copies the running
3121 * sum over (but still wrong, because the group entity and group rq do not have
3122 * their PELT windows aligned).
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003123 *
3124 * However, update_tg_cfs_runnable() is more complex. So we have:
3125 *
3126 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
3127 *
3128 * And since, like util, the runnable part should be directly transferable,
3129 * the following would _appear_ to be the straightforward approach:
3130 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003131 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003132 *
3133 * And per (1) we have:
3134 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003135 * ge->avg.runnable_avg == grq->avg.runnable_avg
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003136 *
3137 * Which gives:
3138 *
3139 * ge->load.weight * grq->avg.load_avg
3140 * ge->avg.load_avg = ----------------------------------- (4)
3141 * grq->load.weight
3142 *
3143 * Except that is wrong!
3144 *
3145 * Because while for entities historical weight is not important and we
3146 * really only care about our future and therefore can consider a pure
3147 * runnable sum, runqueues can NOT do this.
3148 *
3149 * We specifically want runqueues to have a load_avg that includes
3150 * historical weights. Those represent the blocked load, the load we expect
3151 * to (shortly) return to us. This only works by keeping the weights as
3152 * integral part of the sum. We therefore cannot decompose as per (3).
3153 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003154 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3155 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3156 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3157 * runnable section of these tasks overlap (or not). If they were to perfectly
3158 * align the rq as a whole would be runnable 2/3 of the time. If however we
3159 * always have at least 1 runnable task, the rq as a whole is always runnable.
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003160 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003161 * So we'll have to approximate.. :/
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003162 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003163 * Given the constraint:
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003164 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003165 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003166 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003167 * We can construct a rule that adds runnable to a rq by assuming minimal
3168 * overlap.
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003169 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003170 * On removal, we'll assume each task is equally runnable; which yields:
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003171 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003172 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003173 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003174 * XXX: only do this for the part of runnable > running ?
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003175 *
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003176 */
3177
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003178static inline void
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003179update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003180{
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003181 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3182
3183 /* Nothing to update */
3184 if (!delta)
3185 return;
3186
Vincent Guittota4c3c042017-11-16 15:21:52 +01003187 /*
3188 * The relation between sum and avg is:
3189 *
3190 *   sum = avg * (LOAD_AVG_MAX - 1024 + sa->period_contrib)
3191 *
3192 * however, the PELT windows are not aligned between grq and gse.
3193 */
3194
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003195 /* Set new sched_entity's utilization */
3196 se->avg.util_avg = gcfs_rq->avg.util_avg;
3197 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3198
3199 /* Update parent cfs_rq utilization */
3200 add_positive(&cfs_rq->avg.util_avg, delta);
3201 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3202}
3203
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003204static inline void
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003205update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003206{
Vincent Guittota4c3c042017-11-16 15:21:52 +01003207 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
3208 unsigned long runnable_load_avg, load_avg;
3209 u64 runnable_load_sum, load_sum = 0;
3210 s64 delta_sum;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003211
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003212 if (!runnable_sum)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003213 return;
3214
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003215 gcfs_rq->prop_runnable_sum = 0;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003216
Vincent Guittota4c3c042017-11-16 15:21:52 +01003217 if (runnable_sum >= 0) {
3218 /*
3219 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3220 * the CPU is saturated running == runnable.
3221 */
3222 runnable_sum += se->avg.load_sum;
3223 runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX);
3224 } else {
3225 /*
3226 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3227 * assuming all tasks are equally runnable.
3228 */
3229 if (scale_load_down(gcfs_rq->load.weight)) {
3230 load_sum = div_s64(gcfs_rq->avg.load_sum,
3231 scale_load_down(gcfs_rq->load.weight));
3232 }
3233
3234 /* But make sure to not inflate se's runnable */
3235 runnable_sum = min(se->avg.load_sum, load_sum);
3236 }
3237
3238 /*
3239 * runnable_sum can't be lower than running_sum
Ingo Molnar97fb7a02018-03-03 14:01:12 +01003240 * As the running sum is scaled with CPU capacity whereas the runnable sum
Vincent Guittota4c3c042017-11-16 15:21:52 +01003241 * is not, we rescale running_sum first
3242 */
3243 running_sum = se->avg.util_sum /
3244 arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
3245 runnable_sum = max(runnable_sum, running_sum);
3246
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003247 load_sum = (s64)se_weight(se) * runnable_sum;
3248 load_avg = div_s64(load_sum, LOAD_AVG_MAX);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003249
Vincent Guittota4c3c042017-11-16 15:21:52 +01003250 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
3251 delta_avg = load_avg - se->avg.load_avg;
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003252
Vincent Guittota4c3c042017-11-16 15:21:52 +01003253 se->avg.load_sum = runnable_sum;
3254 se->avg.load_avg = load_avg;
3255 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3256 add_positive(&cfs_rq->avg.load_sum, delta_sum);
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003257
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003258 runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
3259 runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
Vincent Guittota4c3c042017-11-16 15:21:52 +01003260 delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum;
3261 delta_avg = runnable_load_avg - se->avg.runnable_load_avg;
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003262
Vincent Guittota4c3c042017-11-16 15:21:52 +01003263 se->avg.runnable_load_sum = runnable_sum;
3264 se->avg.runnable_load_avg = runnable_load_avg;
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003265
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003266 if (se->on_rq) {
Vincent Guittota4c3c042017-11-16 15:21:52 +01003267 add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg);
3268 add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003269 }
3270}
3271
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003272static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003273{
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003274 cfs_rq->propagate = 1;
3275 cfs_rq->prop_runnable_sum += runnable_sum;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003276}
3277
3278/* Update task and its cfs_rq load average */
3279static inline int propagate_entity_load_avg(struct sched_entity *se)
3280{
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003281 struct cfs_rq *cfs_rq, *gcfs_rq;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003282
3283 if (entity_is_task(se))
3284 return 0;
3285
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003286 gcfs_rq = group_cfs_rq(se);
3287 if (!gcfs_rq->propagate)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003288 return 0;
3289
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003290 gcfs_rq->propagate = 0;
3291
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003292 cfs_rq = cfs_rq_of(se);
3293
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003294 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003295
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003296 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3297 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003298
3299 return 1;
3300}
3301
Vincent Guittotbc427892017-03-17 14:47:22 +01003302/*
3303 * Check if we need to update the load and the utilization of a blocked
3304 * group_entity:
3305 */
3306static inline bool skip_blocked_update(struct sched_entity *se)
3307{
3308 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3309
3310 /*
3311 * If sched_entity still have not zero load or utilization, we have to
3312 * decay it:
3313 */
3314 if (se->avg.load_avg || se->avg.util_avg)
3315 return false;
3316
3317 /*
3318 * If there is a pending propagation, we have to update the load and
3319 * the utilization of the sched_entity:
3320 */
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003321 if (gcfs_rq->propagate)
Vincent Guittotbc427892017-03-17 14:47:22 +01003322 return false;
3323
3324 /*
3325 * Otherwise, the load and the utilization of the sched_entity is
3326 * already zero and there is no pending propagation, so it will be a
3327 * waste of time to try to decay it:
3328 */
3329 return true;
3330}
3331
Peter Zijlstra6e831252014-02-11 16:11:48 +01003332#else /* CONFIG_FAIR_GROUP_SCHED */
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003333
Yuyang Du9d89c252015-07-15 08:04:37 +08003334static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003335
3336static inline int propagate_entity_load_avg(struct sched_entity *se)
3337{
3338 return 0;
3339}
3340
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003341static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003342
Peter Zijlstra6e831252014-02-11 16:11:48 +01003343#endif /* CONFIG_FAIR_GROUP_SCHED */
Paul Turnerc566e8e2012-10-04 13:18:30 +02003344
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003345/**
3346 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3347 * @now: current time, as per cfs_rq_clock_task()
3348 * @cfs_rq: cfs_rq to update
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003349 *
3350 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3351 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3352 * post_init_entity_util_avg().
3353 *
3354 * cfs_rq->avg is used for task_h_load() and update_cfs_group() for example.
3355 *
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02003356 * Returns true if the load decayed or we removed load.
3357 *
3358 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3359 * call update_tg_load_avg() when this function returns true.
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003360 */
Steve Mucklea2c6c912016-03-24 15:26:07 -07003361static inline int
Viresh Kumar3a123bb2017-05-24 10:59:56 +05303362update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
Steve Mucklea2c6c912016-03-24 15:26:07 -07003363{
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003364 unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
Steve Mucklea2c6c912016-03-24 15:26:07 -07003365 struct sched_avg *sa = &cfs_rq->avg;
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003366 int decayed = 0;
Steve Mucklea2c6c912016-03-24 15:26:07 -07003367
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003368 if (cfs_rq->removed.nr) {
3369 unsigned long r;
Peter Zijlstra9a2dd582017-05-12 14:18:10 +02003370 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003371
3372 raw_spin_lock(&cfs_rq->removed.lock);
3373 swap(cfs_rq->removed.util_avg, removed_util);
3374 swap(cfs_rq->removed.load_avg, removed_load);
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003375 swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003376 cfs_rq->removed.nr = 0;
3377 raw_spin_unlock(&cfs_rq->removed.lock);
3378
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003379 r = removed_load;
Peter Zijlstra89741892016-06-16 10:50:40 +02003380 sub_positive(&sa->load_avg, r);
Peter Zijlstra9a2dd582017-05-12 14:18:10 +02003381 sub_positive(&sa->load_sum, r * divider);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003382
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003383 r = removed_util;
Peter Zijlstra89741892016-06-16 10:50:40 +02003384 sub_positive(&sa->util_avg, r);
Peter Zijlstra9a2dd582017-05-12 14:18:10 +02003385 sub_positive(&sa->util_sum, r * divider);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003386
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003387 add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003388
3389 decayed = 1;
Steve Mucklea2c6c912016-03-24 15:26:07 -07003390 }
3391
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003392 decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003393
3394#ifndef CONFIG_64BIT
3395 smp_wmb();
3396 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3397#endif
3398
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003399 if (decayed)
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003400 cfs_rq_util_change(cfs_rq, 0);
Steve Muckle21e96f82016-03-21 17:21:07 -07003401
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003402 return decayed;
Yuyang Du9d89c252015-07-15 08:04:37 +08003403}
3404
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003405/**
3406 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3407 * @cfs_rq: cfs_rq to attach to
3408 * @se: sched_entity to attach
Randy Dunlap882a78a2018-09-03 12:53:17 -07003409 * @flags: migration hints
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003410 *
3411 * Must call update_cfs_rq_load_avg() before this, since we rely on
3412 * cfs_rq->avg.last_update_time being current.
3413 */
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003414static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Byungchul Parka05e8c52015-08-20 20:21:56 +09003415{
Peter Zijlstraf2079342017-05-12 14:16:30 +02003416 u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
3417
3418 /*
3419 * When we attach the @se to the @cfs_rq, we must align the decay
3420 * window because without that, really weird and wonderful things can
3421 * happen.
3422 *
3423 * XXX illustrate
3424 */
Byungchul Parka05e8c52015-08-20 20:21:56 +09003425 se->avg.last_update_time = cfs_rq->avg.last_update_time;
Peter Zijlstraf2079342017-05-12 14:16:30 +02003426 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3427
3428 /*
3429 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3430 * period_contrib. This isn't strictly correct, but since we're
3431 * entirely outside of the PELT hierarchy, nobody cares if we truncate
3432 * _sum a little.
3433 */
3434 se->avg.util_sum = se->avg.util_avg * divider;
3435
3436 se->avg.load_sum = divider;
3437 if (se_weight(se)) {
3438 se->avg.load_sum =
3439 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3440 }
3441
3442 se->avg.runnable_load_sum = se->avg.load_sum;
3443
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003444 enqueue_load_avg(cfs_rq, se);
Byungchul Parka05e8c52015-08-20 20:21:56 +09003445 cfs_rq->avg.util_avg += se->avg.util_avg;
3446 cfs_rq->avg.util_sum += se->avg.util_sum;
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003447
3448 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003449
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003450 cfs_rq_util_change(cfs_rq, flags);
Byungchul Parka05e8c52015-08-20 20:21:56 +09003451}
3452
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003453/**
3454 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3455 * @cfs_rq: cfs_rq to detach from
3456 * @se: sched_entity to detach
3457 *
3458 * Must call update_cfs_rq_load_avg() before this, since we rely on
3459 * cfs_rq->avg.last_update_time being current.
3460 */
Byungchul Parka05e8c52015-08-20 20:21:56 +09003461static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3462{
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003463 dequeue_load_avg(cfs_rq, se);
Peter Zijlstra89741892016-06-16 10:50:40 +02003464 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3465 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003466
3467 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003468
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003469 cfs_rq_util_change(cfs_rq, 0);
Byungchul Parka05e8c52015-08-20 20:21:56 +09003470}
3471
Peter Zijlstrab382a532017-05-06 17:37:03 +02003472/*
3473 * Optional action to be done while updating the load average
3474 */
3475#define UPDATE_TG 0x1
3476#define SKIP_AGE_LOAD 0x2
3477#define DO_ATTACH 0x4
3478
3479/* Update task and its cfs_rq load average */
3480static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3481{
3482 u64 now = cfs_rq_clock_task(cfs_rq);
3483 struct rq *rq = rq_of(cfs_rq);
3484 int cpu = cpu_of(rq);
3485 int decayed;
3486
3487 /*
3488 * Track the task's load average so it can be carried to its new CPU after
3489 * migration, and the group sched_entity's load average for the task_h_load() calculation
3490 */
3491 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
3492 __update_load_avg_se(now, cpu, cfs_rq, se);
3493
3494 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3495 decayed |= propagate_entity_load_avg(se);
3496
3497 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3498
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003499 /*
3500 * DO_ATTACH means we're here from enqueue_entity().
3501 * !last_update_time means we've passed through
3502 * migrate_task_rq_fair() indicating we migrated.
3503 *
3504 * IOW we're enqueueing a task on a new CPU.
3505 */
3506 attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION);
Peter Zijlstrab382a532017-05-06 17:37:03 +02003507 update_tg_load_avg(cfs_rq, 0);
3508
3509 } else if (decayed && (flags & UPDATE_TG))
3510 update_tg_load_avg(cfs_rq, 0);
3511}
3512
Yuyang Du0905f042015-12-17 07:34:27 +08003513#ifndef CONFIG_64BIT
3514static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3515{
3516 u64 last_update_time_copy;
3517 u64 last_update_time;
3518
3519 do {
3520 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3521 smp_rmb();
3522 last_update_time = cfs_rq->avg.last_update_time;
3523 } while (last_update_time != last_update_time_copy);
3524
3525 return last_update_time;
3526}
3527#else
3528static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3529{
3530 return cfs_rq->avg.last_update_time;
3531}
3532#endif
3533
Paul Turner9ee474f2012-10-04 13:18:30 +02003534/*
Morten Rasmussen104cb162016-10-14 14:41:07 +01003535 * Synchronize entity load avg of dequeued entity without locking
3536 * the previous rq.
3537 */
3538void sync_entity_load_avg(struct sched_entity *se)
3539{
3540 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3541 u64 last_update_time;
3542
3543 last_update_time = cfs_rq_last_update_time(cfs_rq);
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003544 __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
Morten Rasmussen104cb162016-10-14 14:41:07 +01003545}
3546
3547/*
Yuyang Du9d89c252015-07-15 08:04:37 +08003548 * Task first catches up with cfs_rq, and then subtract
3549 * itself from the cfs_rq (task must be off the queue now).
Paul Turner9ee474f2012-10-04 13:18:30 +02003550 */
Yuyang Du9d89c252015-07-15 08:04:37 +08003551void remove_entity_load_avg(struct sched_entity *se)
Paul Turner9ee474f2012-10-04 13:18:30 +02003552{
Yuyang Du9d89c252015-07-15 08:04:37 +08003553 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003554 unsigned long flags;
Paul Turner9ee474f2012-10-04 13:18:30 +02003555
Yuyang Du0905f042015-12-17 07:34:27 +08003556 /*
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02003557 * tasks cannot exit without having gone through wake_up_new_task() ->
3558 * post_init_entity_util_avg() which will have added things to the
3559 * cfs_rq, so we can remove unconditionally.
3560 *
3561 * Similarly for groups, they will have passed through
3562 * post_init_entity_util_avg() before unregister_sched_fair_group()
3563 * calls this.
Yuyang Du0905f042015-12-17 07:34:27 +08003564 */
Paul Turner9ee474f2012-10-04 13:18:30 +02003565
Morten Rasmussen104cb162016-10-14 14:41:07 +01003566 sync_entity_load_avg(se);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003567
3568 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3569 ++cfs_rq->removed.nr;
3570 cfs_rq->removed.util_avg += se->avg.util_avg;
3571 cfs_rq->removed.load_avg += se->avg.load_avg;
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003572 cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003573 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
Paul Turner2dac7542012-10-04 13:18:30 +02003574}
Vincent Guittot642dbc32013-04-18 18:34:26 +02003575
Yuyang Du7ea241a2015-07-15 08:04:42 +08003576static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3577{
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003578 return cfs_rq->avg.runnable_load_avg;
Yuyang Du7ea241a2015-07-15 08:04:42 +08003579}
3580
3581static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3582{
3583 return cfs_rq->avg.load_avg;
3584}
3585
Matt Fleming46f69fa2016-09-21 14:38:12 +01003586static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
Peter Zijlstra6e831252014-02-11 16:11:48 +01003587
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003588static inline unsigned long task_util(struct task_struct *p)
3589{
3590 return READ_ONCE(p->se.avg.util_avg);
3591}
3592
3593static inline unsigned long _task_util_est(struct task_struct *p)
3594{
3595 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3596
3597 return max(ue.ewma, ue.enqueued);
3598}
3599
3600static inline unsigned long task_util_est(struct task_struct *p)
3601{
3602 return max(task_util(p), _task_util_est(p));
3603}
3604
3605static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3606 struct task_struct *p)
3607{
3608 unsigned int enqueued;
3609
3610 if (!sched_feat(UTIL_EST))
3611 return;
3612
3613 /* Update root cfs_rq's estimated utilization */
3614 enqueued = cfs_rq->avg.util_est.enqueued;
Patrick Bellasid5193292018-03-09 09:52:45 +00003615 enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003616 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
3617}
3618
3619/*
3620 * Check if a (signed) value is within a specified (unsigned) margin,
3621 * based on the observation that:
3622 *
3623 * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
3624 *
3625 * NOTE: this only works when value + margin < INT_MAX.
3626 */
3627static inline bool within_margin(int value, int margin)
3628{
3629 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
3630}
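/*
 * Quick check of the identity above (illustrative): value = -3 and
 * margin = 5 give (unsigned)(-3 + 5 - 1) = 1 < 9, i.e. true, matching
 * abs(-3) < 5; value = -7 wraps around to a huge unsigned number and
 * correctly fails the comparison.
 */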
3631
3632static void
3633util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3634{
3635 long last_ewma_diff;
3636 struct util_est ue;
3637
3638 if (!sched_feat(UTIL_EST))
3639 return;
3640
Vincent Guittot3482d982018-06-14 12:33:00 +02003641 /* Update root cfs_rq's estimated utilization */
3642 ue.enqueued = cfs_rq->avg.util_est.enqueued;
3643 ue.enqueued -= min_t(unsigned int, ue.enqueued,
3644 (_task_util_est(p) | UTIL_AVG_UNCHANGED));
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003645 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
3646
3647 /*
3648 * Skip update of task's estimated utilization when the task has not
3649 * yet completed an activation, e.g. being migrated.
3650 */
3651 if (!task_sleep)
3652 return;
3653
3654 /*
Patrick Bellasid5193292018-03-09 09:52:45 +00003655 * If the PELT values haven't changed since enqueue time,
3656 * skip the util_est update.
3657 */
3658 ue = p->se.avg.util_est;
3659 if (ue.enqueued & UTIL_AVG_UNCHANGED)
3660 return;
3661
3662 /*
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003663 * Skip update of task's estimated utilization when its EWMA is
3664 * already ~1% close to its last activation value.
3665 */
Patrick Bellasid5193292018-03-09 09:52:45 +00003666 ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003667 last_ewma_diff = ue.enqueued - ue.ewma;
3668 if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
3669 return;
3670
3671 /*
3672 * Update Task's estimated utilization
3673 *
3674 * When *p completes an activation we can consolidate another sample
3675 * of the task size. This is done by storing the current PELT value
3676 * as ue.enqueued and by using this value to update the Exponential
3677 * Weighted Moving Average (EWMA):
3678 *
3679 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
3680 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
3681 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
3682 * = w * ( last_ewma_diff ) + ewma(t-1)
3683 * = w * (last_ewma_diff + ewma(t-1) / w)
3684 *
3685 * Where 'w' is the weight of new samples, which is configured to be
3686 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
3687 */
3688 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
3689 ue.ewma += last_ewma_diff;
3690 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
3691 WRITE_ONCE(p->se.avg.util_est, ue);
3692}
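/*
 * Worked EWMA example (made-up values, ignoring the UTIL_AVG_UNCHANGED
 * flag bit): with ewma(t-1) = 400 and a new sample task_util(p) = 480,
 * last_ewma_diff = 80 and the shift-add-shift above yields
 * ((400 << 2) + 80) >> 2 = 420, i.e. the EWMA moves a quarter of the
 * way towards the new sample.
 */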
3693
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01003694static inline int task_fits_capacity(struct task_struct *p, long capacity)
3695{
3696 return capacity * 1024 > task_util_est(p) * capacity_margin;
3697}
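/*
 * Sketch of the fit check above, assuming capacity_margin is 1280 (a
 * ~25% headroom factor) as in contemporary kernels: a task with
 * task_util_est() = 300 fits a CPU of capacity 512 because
 * 512 * 1024 > 300 * 1280, while a task of 410 does not
 * (410 * 1280 = 524800 > 524288).
 */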
3698
3699static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
3700{
3701 if (!static_branch_unlikely(&sched_asym_cpucapacity))
3702 return;
3703
3704 if (!p) {
3705 rq->misfit_task_load = 0;
3706 return;
3707 }
3708
3709 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
3710 rq->misfit_task_load = 0;
3711 return;
3712 }
3713
3714 rq->misfit_task_load = task_h_load(p);
3715}
3716
Peter Zijlstra38033c32014-01-23 20:32:21 +01003717#else /* CONFIG_SMP */
3718
Vincent Guittotd31b1a62016-11-08 10:53:44 +01003719#define UPDATE_TG 0x0
3720#define SKIP_AGE_LOAD 0x0
Peter Zijlstrab382a532017-05-06 17:37:03 +02003721#define DO_ATTACH 0x0
Vincent Guittotd31b1a62016-11-08 10:53:44 +01003722
Peter Zijlstra88c06162017-05-06 17:32:43 +02003723static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
Rafael J. Wysocki536bd002016-05-06 14:58:43 +02003724{
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003725 cfs_rq_util_change(cfs_rq, 0);
Rafael J. Wysocki536bd002016-05-06 14:58:43 +02003726}
3727
Yuyang Du9d89c252015-07-15 08:04:37 +08003728static inline void remove_entity_load_avg(struct sched_entity *se) {}
Peter Zijlstra6e831252014-02-11 16:11:48 +01003729
Byungchul Parka05e8c52015-08-20 20:21:56 +09003730static inline void
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003731attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) {}
Byungchul Parka05e8c52015-08-20 20:21:56 +09003732static inline void
3733detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3734
Matt Fleming46f69fa2016-09-21 14:38:12 +01003735static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
Peter Zijlstra6e831252014-02-11 16:11:48 +01003736{
3737 return 0;
3738}
3739
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003740static inline void
3741util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
3742
3743static inline void
3744util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
3745 bool task_sleep) {}
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01003746static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003747
Peter Zijlstra38033c32014-01-23 20:32:21 +01003748#endif /* CONFIG_SMP */
Paul Turner9d85f212012-10-04 13:18:29 +02003749
Peter Zijlstraddc97292007-10-15 17:00:10 +02003750static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3751{
3752#ifdef CONFIG_SCHED_DEBUG
3753 s64 d = se->vruntime - cfs_rq->min_vruntime;
3754
3755 if (d < 0)
3756 d = -d;
3757
3758 if (d > 3*sysctl_sched_latency)
Josh Poimboeufae928822016-06-17 12:43:24 -05003759 schedstat_inc(cfs_rq->nr_spread_over);
Peter Zijlstraddc97292007-10-15 17:00:10 +02003760#endif
3761}
3762
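/*
 * Choose the vruntime for a newly placed entity: a freshly forked task
 * (initial, START_DEBIT) is charged one vslice beyond min_vruntime, while
 * a waking sleeper is credited up to one latency period (halved under
 * GENTLE_FAIR_SLEEPERS); the result never moves vruntime backwards.
 */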
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003763static void
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003764place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3765{
Peter Zijlstra1af5f732008-10-24 11:06:13 +02003766 u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02003767
Peter Zijlstra2cb86002007-11-09 22:39:37 +01003768 /*
3769 * The 'current' period is already promised to the current tasks,
3770 * however the extra weight of the new task will slow them down a
3771 * little, place the new task so that it fits in the slot that
3772 * stays open at the end.
3773 */
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02003774 if (initial && sched_feat(START_DEBIT))
Peter Zijlstraf9c0b092008-10-17 19:27:04 +02003775 vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003776
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003777 /* sleeps up to a single latency don't count. */
Mike Galbraith5ca98802010-03-11 17:17:17 +01003778 if (!initial) {
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003779 unsigned long thresh = sysctl_sched_latency;
Peter Zijlstraa7be37a2008-06-27 13:41:11 +02003780
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003781 /*
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003782 * Halve their sleep time's effect, to allow
3783 * for a gentler effect of sleepers:
3784 */
3785 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3786 thresh >>= 1;
Ingo Molnar51e03042009-09-16 08:54:45 +02003787
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02003788 vruntime -= thresh;
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003789 }
3790
Mike Galbraithb5d9d732009-09-08 11:12:28 +02003791 /* ensure we never gain time by being placed backwards. */
Viresh Kumar16c8f1c2012-11-08 13:33:46 +05303792 se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003793}
3794
Paul Turnerd3d9dc32011-07-21 09:43:39 -07003795static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3796
Mel Gormancb251762016-02-05 09:08:36 +00003797static inline void check_schedstat_required(void)
3798{
3799#ifdef CONFIG_SCHEDSTATS
3800 if (schedstat_enabled())
3801 return;
3802
3803 /* Force schedstat enabled if a dependent tracepoint is active */
3804 if (trace_sched_stat_wait_enabled() ||
3805 trace_sched_stat_sleep_enabled() ||
3806 trace_sched_stat_iowait_enabled() ||
3807 trace_sched_stat_blocked_enabled() ||
3808 trace_sched_stat_runtime_enabled()) {
Josh Poimboeufeda8dca2016-06-13 02:32:09 -05003809 printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
Mel Gormancb251762016-02-05 09:08:36 +00003810 "stat_blocked and stat_runtime require the "
Marcin Nowakowskif67abed2017-06-09 10:00:29 +02003811 "kernel parameter schedstats=enable or "
Mel Gormancb251762016-02-05 09:08:36 +00003812 "kernel.sched_schedstats=1\n");
3813 }
3814#endif
3815}
3816
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02003817
3818/*
3819 * MIGRATION
3820 *
3821 * dequeue
3822 * update_curr()
3823 * update_min_vruntime()
3824 * vruntime -= min_vruntime
3825 *
3826 * enqueue
3827 * update_curr()
3828 * update_min_vruntime()
3829 * vruntime += min_vruntime
3830 *
3831 * this way the vruntime transition between RQs is done when both
3832 * min_vruntime are up-to-date.
3833 *
3834 * WAKEUP (remote)
3835 *
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02003836 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02003837 * vruntime -= min_vruntime
3838 *
3839 * enqueue
3840 * update_curr()
3841 * update_min_vruntime()
3842 * vruntime += min_vruntime
3843 *
3844 * this way we may use a stale min_vruntime on the originating
3845 * CPU, but an up-to-date min_vruntime on the destination CPU.
3846 */
3847
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003848static void
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01003849enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003850{
Peter Zijlstra2f950352016-05-11 19:27:56 +02003851 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
3852 bool curr = cfs_rq->curr == se;
Peter Zijlstra3a47d512016-03-09 13:04:03 +01003853
Ingo Molnar53d3bc72016-05-11 08:25:53 +02003854 /*
Peter Zijlstra2f950352016-05-11 19:27:56 +02003855 * If we're the current task, we must renormalise before calling
3856 * update_curr().
Ingo Molnar53d3bc72016-05-11 08:25:53 +02003857 */
Peter Zijlstra2f950352016-05-11 19:27:56 +02003858 if (renorm && curr)
3859 se->vruntime += cfs_rq->min_vruntime;
3860
Ingo Molnarb7cc0892007-08-09 11:16:47 +02003861 update_curr(cfs_rq);
Peter Zijlstra2f950352016-05-11 19:27:56 +02003862
3863 /*
3864 * Otherwise, renormalise after, such that we're placed at the current
3865 * moment in time, instead of some random moment in the past. Being
3866 * placed in the past could significantly boost this task to the
3867 * fairness detriment of existing tasks.
3868 */
3869 if (renorm && !curr)
3870 se->vruntime += cfs_rq->min_vruntime;
3871
Vincent Guittot89ee0482016-12-21 16:50:26 +01003872 /*
3873 * When enqueuing a sched_entity, we must:
3874 * - Update loads to have both entity and cfs_rq synced with now.
3875 * - Add its load to cfs_rq->runnable_avg
3876 * - For group_entity, update its weight to reflect the new share of
3877 * its group cfs_rq
3878 * - Add its new weight to cfs_rq->load.weight
3879 */
Peter Zijlstrab382a532017-05-06 17:37:03 +02003880 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003881 update_cfs_group(se);
Peter Zijlstrab5b3e352017-08-24 17:38:30 +02003882 enqueue_runnable_load_avg(cfs_rq, se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08003883 account_entity_enqueue(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003884
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05003885 if (flags & ENQUEUE_WAKEUP)
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02003886 place_entity(cfs_rq, se, 0);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003887
Mel Gormancb251762016-02-05 09:08:36 +00003888 check_schedstat_required();
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05003889 update_stats_enqueue(cfs_rq, se, flags);
3890 check_spread(cfs_rq, se);
Peter Zijlstra2f950352016-05-11 19:27:56 +02003891 if (!curr)
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02003892 __enqueue_entity(cfs_rq, se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003893 se->on_rq = 1;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08003894
Paul Turnerd3d9dc32011-07-21 09:43:39 -07003895 if (cfs_rq->nr_running == 1) {
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08003896 list_add_leaf_cfs_rq(cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07003897 check_enqueue_throttle(cfs_rq);
3898 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003899}
3900
Rik van Riel2c13c9192011-02-01 09:48:37 -05003901static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra2002c692008-11-11 11:52:33 +01003902{
Rik van Riel2c13c9192011-02-01 09:48:37 -05003903 for_each_sched_entity(se) {
3904 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01003905 if (cfs_rq->last != se)
Rik van Riel2c13c9192011-02-01 09:48:37 -05003906 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01003907
3908 cfs_rq->last = NULL;
Rik van Riel2c13c9192011-02-01 09:48:37 -05003909 }
3910}
Peter Zijlstra2002c692008-11-11 11:52:33 +01003911
Rik van Riel2c13c9192011-02-01 09:48:37 -05003912static void __clear_buddies_next(struct sched_entity *se)
3913{
3914 for_each_sched_entity(se) {
3915 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01003916 if (cfs_rq->next != se)
Rik van Riel2c13c9192011-02-01 09:48:37 -05003917 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01003918
3919 cfs_rq->next = NULL;
Rik van Riel2c13c9192011-02-01 09:48:37 -05003920 }
Peter Zijlstra2002c692008-11-11 11:52:33 +01003921}
3922
Rik van Rielac53db52011-02-01 09:51:03 -05003923static void __clear_buddies_skip(struct sched_entity *se)
3924{
3925 for_each_sched_entity(se) {
3926 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01003927 if (cfs_rq->skip != se)
Rik van Rielac53db52011-02-01 09:51:03 -05003928 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01003929
3930 cfs_rq->skip = NULL;
Rik van Rielac53db52011-02-01 09:51:03 -05003931 }
3932}
3933
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01003934static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3935{
Rik van Riel2c13c9192011-02-01 09:48:37 -05003936 if (cfs_rq->last == se)
3937 __clear_buddies_last(se);
3938
3939 if (cfs_rq->next == se)
3940 __clear_buddies_next(se);
Rik van Rielac53db52011-02-01 09:51:03 -05003941
3942 if (cfs_rq->skip == se)
3943 __clear_buddies_skip(se);
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01003944}
3945
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07003946static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd8b49862011-07-21 09:43:41 -07003947
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003948static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01003949dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003950{
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02003951 /*
3952 * Update run-time statistics of the 'current'.
3953 */
3954 update_curr(cfs_rq);
Vincent Guittot89ee0482016-12-21 16:50:26 +01003955
3956 /*
3957 * When dequeuing a sched_entity, we must:
3958 * - Update loads to have both entity and cfs_rq synced with now.
3959	 * - Subtract its load from the cfs_rq->runnable_avg.
3960	 * - Subtract its previous weight from cfs_rq->load.weight.
3961 * - For group entity, update its weight to reflect the new share
3962 * of its group cfs_rq.
3963 */
Peter Zijlstra88c06162017-05-06 17:32:43 +02003964 update_load_avg(cfs_rq, se, UPDATE_TG);
Peter Zijlstrab5b3e352017-08-24 17:38:30 +02003965 dequeue_runnable_load_avg(cfs_rq, se);
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02003966
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05003967 update_stats_dequeue(cfs_rq, se, flags);
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02003968
Peter Zijlstra2002c692008-11-11 11:52:33 +01003969 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01003970
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02003971 if (se != cfs_rq->curr)
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02003972 __dequeue_entity(cfs_rq, se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08003973 se->on_rq = 0;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02003974 account_entity_dequeue(cfs_rq, se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01003975
3976 /*
Peter Zijlstrab60205c2016-09-20 21:58:12 +02003977	 * Normalize after update_curr(), which will also have moved
3978	 * min_vruntime if @se is the one holding it back, but before
3979	 * update_min_vruntime() runs again, since that would discount @se's
3980	 * position and could move min_vruntime forward still more.
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01003981 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01003982 if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01003983 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra1e876232011-05-17 16:21:10 -07003984
Paul Turnerd8b49862011-07-21 09:43:41 -07003985 /* return excess runtime on last dequeue */
3986 return_cfs_rq_runtime(cfs_rq);
3987
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003988 update_cfs_group(se);
Peter Zijlstrab60205c2016-09-20 21:58:12 +02003989
3990 /*
3991 * Now advance min_vruntime if @se was the entity holding it back,
3992 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
3993 * put back on, and if we advance min_vruntime, we'll be placed back
3994 * further than we started -- ie. we'll be penalized.
3995 */
3996 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
3997 update_min_vruntime(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003998}
3999
4000/*
4001 * Check at scheduler-tick time whether the current task must be preempted:
4002 */
Peter Zijlstra7c92e542007-09-05 14:32:49 +02004003static void
Ingo Molnar2e09bf52007-10-15 17:00:05 +02004004check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004005{
Peter Zijlstra11697832007-09-05 14:32:49 +02004006 unsigned long ideal_runtime, delta_exec;
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004007 struct sched_entity *se;
4008 s64 delta;
Peter Zijlstra11697832007-09-05 14:32:49 +02004009
Peter Zijlstra6d0f0eb2007-10-15 17:00:05 +02004010 ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra11697832007-09-05 14:32:49 +02004011 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01004012 if (delta_exec > ideal_runtime) {
Kirill Tkhai88751252014-06-29 00:03:57 +04004013 resched_curr(rq_of(cfs_rq));
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01004014 /*
4015 * The current task ran long enough, ensure it doesn't get
4016 * re-elected due to buddy favours.
4017 */
4018 clear_buddies(cfs_rq, curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02004019 return;
4020 }
4021
4022 /*
4023 * Ensure that a task that missed wakeup preemption by a
4024 * narrow margin doesn't have to wait for a full slice.
4025 * This also mitigates buddy induced latencies under load.
4026 */
Mike Galbraithf685cea2009-10-23 23:09:22 +02004027 if (delta_exec < sysctl_sched_min_granularity)
4028 return;
4029
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004030 se = __pick_first_entity(cfs_rq);
4031 delta = curr->vruntime - se->vruntime;
Mike Galbraithf685cea2009-10-23 23:09:22 +02004032
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004033 if (delta < 0)
4034 return;
Mike Galbraithd7d82942011-01-05 05:41:17 +01004035
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004036 if (delta > ideal_runtime)
Kirill Tkhai88751252014-06-29 00:03:57 +04004037 resched_curr(rq_of(cfs_rq));
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004038}
4039
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004040static void
Ingo Molnar8494f412007-08-09 11:16:48 +02004041set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004042{
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004043 /* 'current' is not kept within the tree. */
4044 if (se->on_rq) {
4045 /*
4046		 * Any task has to be enqueued before it gets to execute on
4047 * a CPU. So account for the time it spent waiting on the
4048 * runqueue.
4049 */
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004050 update_stats_wait_end(cfs_rq, se);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004051 __dequeue_entity(cfs_rq, se);
Peter Zijlstra88c06162017-05-06 17:32:43 +02004052 update_load_avg(cfs_rq, se, UPDATE_TG);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004053 }
4054
Ingo Molnar79303e92007-08-09 11:16:47 +02004055 update_stats_curr_start(cfs_rq, se);
Ingo Molnar429d43b2007-10-15 17:00:03 +02004056 cfs_rq->curr = se;
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004057
Ingo Molnareba1ed42007-10-15 17:00:02 +02004058 /*
4059 * Track our maximum slice length, if the CPU's load is at
4060 * least twice that of our own weight (i.e. dont track it
4061 * when there are only lesser-weight tasks around):
4062 */
Mel Gormancb251762016-02-05 09:08:36 +00004063 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004064 schedstat_set(se->statistics.slice_max,
4065 max((u64)schedstat_val(se->statistics.slice_max),
4066 se->sum_exec_runtime - se->prev_sum_exec_runtime));
Ingo Molnareba1ed42007-10-15 17:00:02 +02004067 }
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004068
Peter Zijlstra4a55b452007-09-05 14:32:49 +02004069 se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004070}
4071
Peter Zijlstra3f3a4902008-10-24 11:06:16 +02004072static int
4073wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4074
Rik van Rielac53db52011-02-01 09:51:03 -05004075/*
4076 * Pick the next process, keeping these things in mind, in this order:
4077 * 1) keep things fair between processes/task groups
4078 * 2) pick the "next" process, since someone really wants that to run
4079 * 3) pick the "last" process, for cache locality
4080 * 4) do not run the "skip" process, if something else is available
4081 */
Peter Zijlstra678d5712012-02-11 06:05:00 +01004082static struct sched_entity *
4083pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004084{
Peter Zijlstra678d5712012-02-11 06:05:00 +01004085 struct sched_entity *left = __pick_first_entity(cfs_rq);
4086 struct sched_entity *se;
4087
4088 /*
4089 * If curr is set we have to see if its left of the leftmost entity
4090 * still in the tree, provided there was anything in the tree at all.
4091 */
4092 if (!left || (curr && entity_before(curr, left)))
4093 left = curr;
4094
4095 se = left; /* ideally we run the leftmost entity */
Peter Zijlstraf4b67552008-11-04 21:25:07 +01004096
Rik van Rielac53db52011-02-01 09:51:03 -05004097 /*
4098 * Avoid running the skip buddy, if running something else can
4099 * be done without getting too unfair.
4100 */
4101 if (cfs_rq->skip == se) {
Peter Zijlstra678d5712012-02-11 06:05:00 +01004102 struct sched_entity *second;
4103
4104 if (se == curr) {
4105 second = __pick_first_entity(cfs_rq);
4106 } else {
4107 second = __pick_next_entity(se);
4108 if (!second || (curr && entity_before(curr, second)))
4109 second = curr;
4110 }
4111
Rik van Rielac53db52011-02-01 09:51:03 -05004112 if (second && wakeup_preempt_entity(second, left) < 1)
4113 se = second;
4114 }
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004115
Mike Galbraithf685cea2009-10-23 23:09:22 +02004116 /*
4117 * Prefer last buddy, try to return the CPU to a preempted task.
4118 */
4119 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
4120 se = cfs_rq->last;
4121
Rik van Rielac53db52011-02-01 09:51:03 -05004122 /*
4123 * Someone really wants this to run. If it's not unfair, run it.
4124 */
4125 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
4126 se = cfs_rq->next;
4127
Mike Galbraithf685cea2009-10-23 23:09:22 +02004128 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01004129
4130 return se;
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004131}
4132
Peter Zijlstra678d5712012-02-11 06:05:00 +01004133static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004134
Ingo Molnarab6cde22007-08-09 11:16:48 +02004135static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004136{
4137 /*
4138 * If still on the runqueue then deactivate_task()
4139 * was not called and update_curr() has to be done:
4140 */
4141 if (prev->on_rq)
Ingo Molnarb7cc0892007-08-09 11:16:47 +02004142 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004143
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004144 /* throttle cfs_rqs exceeding runtime */
4145 check_cfs_rq_runtime(cfs_rq);
4146
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004147 check_spread(cfs_rq, prev);
Mel Gormancb251762016-02-05 09:08:36 +00004148
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004149 if (prev->on_rq) {
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004150 update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004151 /* Put 'current' back into the tree. */
4152 __enqueue_entity(cfs_rq, prev);
Paul Turner9d85f212012-10-04 13:18:29 +02004153 /* in !on_rq case, update occurred at dequeue */
Peter Zijlstra88c06162017-05-06 17:32:43 +02004154 update_load_avg(cfs_rq, prev, 0);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004155 }
Ingo Molnar429d43b2007-10-15 17:00:03 +02004156 cfs_rq->curr = NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004157}
4158
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004159static void
4160entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004161{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004162 /*
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004163 * Update run-time statistics of the 'current'.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004164 */
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004165 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004166
Paul Turner43365bd2010-12-15 19:10:17 -08004167 /*
Paul Turner9d85f212012-10-04 13:18:29 +02004168 * Ensure that runnable average is periodically updated.
4169 */
Peter Zijlstra88c06162017-05-06 17:32:43 +02004170 update_load_avg(cfs_rq, curr, UPDATE_TG);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02004171 update_cfs_group(curr);
Paul Turner9d85f212012-10-04 13:18:29 +02004172
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004173#ifdef CONFIG_SCHED_HRTICK
4174 /*
4175 * queued ticks are scheduled to match the slice, so don't bother
4176 * validating it and just reschedule.
4177 */
Harvey Harrison983ed7a2008-04-24 18:17:55 -07004178 if (queued) {
Kirill Tkhai88751252014-06-29 00:03:57 +04004179 resched_curr(rq_of(cfs_rq));
Harvey Harrison983ed7a2008-04-24 18:17:55 -07004180 return;
4181 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004182 /*
4183 * don't let the period tick interfere with the hrtick preemption
4184 */
4185 if (!sched_feat(DOUBLE_TICK) &&
4186 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4187 return;
4188#endif
4189
Yong Zhang2c2efae2011-07-29 16:20:33 +08004190 if (cfs_rq->nr_running > 1)
Ingo Molnar2e09bf52007-10-15 17:00:05 +02004191 check_preempt_tick(cfs_rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004192}
4193
Paul Turnerab84d312011-07-21 09:43:28 -07004194
4195/**************************************************
4196 * CFS bandwidth control machinery
4197 */
4198
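/*
 * Overview: each task_group has a global runtime pool (cfs_b->runtime),
 * refilled to cfs_b->quota once per cfs_b->period. Per-CPU cfs_rqs pull
 * runtime from that pool in slices; when a cfs_rq exhausts its local
 * runtime and the pool cannot refill it, the cfs_rq is throttled
 * (dequeued) until the period or slack timer hands it runtime again.
 */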
4199#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra029632f2011-10-25 10:00:11 +02004200
4201#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01004202static struct static_key __cfs_bandwidth_used;
Peter Zijlstra029632f2011-10-25 10:00:11 +02004203
4204static inline bool cfs_bandwidth_used(void)
4205{
Ingo Molnarc5905af2012-02-24 08:31:31 +01004206 return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004207}
4208
Ben Segall1ee14e62013-10-16 11:16:12 -07004209void cfs_bandwidth_usage_inc(void)
Peter Zijlstra029632f2011-10-25 10:00:11 +02004210{
Peter Zijlstrace48c1462018-01-22 22:53:28 +01004211 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
Ben Segall1ee14e62013-10-16 11:16:12 -07004212}
4213
4214void cfs_bandwidth_usage_dec(void)
4215{
Peter Zijlstrace48c1462018-01-22 22:53:28 +01004216 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004217}
4218#else /* HAVE_JUMP_LABEL */
4219static bool cfs_bandwidth_used(void)
4220{
4221 return true;
4222}
4223
Ben Segall1ee14e62013-10-16 11:16:12 -07004224void cfs_bandwidth_usage_inc(void) {}
4225void cfs_bandwidth_usage_dec(void) {}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004226#endif /* HAVE_JUMP_LABEL */
4227
Paul Turnerab84d312011-07-21 09:43:28 -07004228/*
4229 * default period for cfs group bandwidth.
4230 * default: 0.1s, units: nanoseconds
4231 */
4232static inline u64 default_cfs_period(void)
4233{
4234 return 100000000ULL;
4235}
Paul Turnerec12cb72011-07-21 09:43:30 -07004236
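/*
 * Size of the runtime slice moved from the global pool to a cfs_rq per
 * refill; sysctl_sched_cfs_bandwidth_slice is in microseconds (5000us,
 * i.e. a 5ms slice, by default at the time of writing).
 */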
4237static inline u64 sched_cfs_bandwidth_slice(void)
4238{
4239 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4240}
4241
Paul Turnera9cf55b2011-07-21 09:43:32 -07004242/*
4243 * Replenish runtime according to assigned quota and update expiration time.
4244 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
4245 * additional synchronization around rq->lock.
4246 *
4247 * requires cfs_b->lock
4248 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02004249void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turnera9cf55b2011-07-21 09:43:32 -07004250{
4251 u64 now;
4252
4253 if (cfs_b->quota == RUNTIME_INF)
4254 return;
4255
4256 now = sched_clock_cpu(smp_processor_id());
4257 cfs_b->runtime = cfs_b->quota;
4258 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
Xunlei Pang512ac992018-06-20 18:18:33 +08004259 cfs_b->expires_seq++;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004260}
4261
Peter Zijlstra029632f2011-10-25 10:00:11 +02004262static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4263{
4264 return &tg->cfs_bandwidth;
4265}
4266
Paul Turnerf1b17282012-10-04 13:18:31 +02004267/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
4268static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4269{
4270 if (unlikely(cfs_rq->throttle_count))
Xunlei Pang1a99ae32016-05-10 21:03:18 +08004271 return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
Paul Turnerf1b17282012-10-04 13:18:31 +02004272
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004273 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turnerf1b17282012-10-04 13:18:31 +02004274}
4275
Paul Turner85dac902011-07-21 09:43:33 -07004276/* returns 0 on failure to allocate runtime */
4277static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerec12cb72011-07-21 09:43:30 -07004278{
4279 struct task_group *tg = cfs_rq->tg;
4280 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turnera9cf55b2011-07-21 09:43:32 -07004281 u64 amount = 0, min_amount, expires;
Xunlei Pang512ac992018-06-20 18:18:33 +08004282 int expires_seq;
Paul Turnerec12cb72011-07-21 09:43:30 -07004283
4284 /* note: this is a positive sum as runtime_remaining <= 0 */
4285 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
4286
4287 raw_spin_lock(&cfs_b->lock);
4288 if (cfs_b->quota == RUNTIME_INF)
4289 amount = min_amount;
Paul Turner58088ad2011-07-21 09:43:31 -07004290 else {
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004291 start_cfs_bandwidth(cfs_b);
Paul Turner58088ad2011-07-21 09:43:31 -07004292
4293 if (cfs_b->runtime > 0) {
4294 amount = min(cfs_b->runtime, min_amount);
4295 cfs_b->runtime -= amount;
4296 cfs_b->idle = 0;
4297 }
Paul Turnerec12cb72011-07-21 09:43:30 -07004298 }
Xunlei Pang512ac992018-06-20 18:18:33 +08004299 expires_seq = cfs_b->expires_seq;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004300 expires = cfs_b->runtime_expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07004301 raw_spin_unlock(&cfs_b->lock);
4302
4303 cfs_rq->runtime_remaining += amount;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004304 /*
4305 * we may have advanced our local expiration to account for allowed
4306 * spread between our sched_clock and the one on which runtime was
4307 * issued.
4308 */
Xunlei Pang512ac992018-06-20 18:18:33 +08004309 if (cfs_rq->expires_seq != expires_seq) {
4310 cfs_rq->expires_seq = expires_seq;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004311 cfs_rq->runtime_expires = expires;
Xunlei Pang512ac992018-06-20 18:18:33 +08004312 }
Paul Turner85dac902011-07-21 09:43:33 -07004313
4314 return cfs_rq->runtime_remaining > 0;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004315}
4316
4317/*
4318 * Note: This depends on the synchronization provided by sched_clock and the
4319 * fact that rq->clock snapshots this value.
4320 */
4321static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4322{
4323 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turnera9cf55b2011-07-21 09:43:32 -07004324
4325 /* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004326 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turnera9cf55b2011-07-21 09:43:32 -07004327 return;
4328
4329 if (cfs_rq->runtime_remaining < 0)
4330 return;
4331
4332 /*
4333 * If the local deadline has passed we have to consider the
4334 * possibility that our sched_clock is 'fast' and the global deadline
4335 * has not truly expired.
4336 *
4337 * Fortunately we can check determine whether this the case by checking
Xunlei Pang512ac992018-06-20 18:18:33 +08004338 * whether the global deadline(cfs_b->expires_seq) has advanced.
Paul Turnera9cf55b2011-07-21 09:43:32 -07004339 */
Xunlei Pang512ac992018-06-20 18:18:33 +08004340 if (cfs_rq->expires_seq == cfs_b->expires_seq) {
Paul Turnera9cf55b2011-07-21 09:43:32 -07004341 /* extend local deadline, drift is bounded above by 2 ticks */
4342 cfs_rq->runtime_expires += TICK_NSEC;
4343 } else {
4344 /* global deadline is ahead, expiration has passed */
4345 cfs_rq->runtime_remaining = 0;
4346 }
Paul Turnerec12cb72011-07-21 09:43:30 -07004347}
4348
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004349static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07004350{
Paul Turnera9cf55b2011-07-21 09:43:32 -07004351 /* dock delta_exec before expiring quota (as it could span periods) */
Paul Turnerec12cb72011-07-21 09:43:30 -07004352 cfs_rq->runtime_remaining -= delta_exec;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004353 expire_cfs_rq_runtime(cfs_rq);
4354
4355 if (likely(cfs_rq->runtime_remaining > 0))
Paul Turnerec12cb72011-07-21 09:43:30 -07004356 return;
4357
Paul Turner85dac902011-07-21 09:43:33 -07004358 /*
4359 * if we're unable to extend our runtime we resched so that the active
4360 * hierarchy can be throttled
4361 */
4362 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
Kirill Tkhai88751252014-06-29 00:03:57 +04004363 resched_curr(rq_of(cfs_rq));
Paul Turnerec12cb72011-07-21 09:43:30 -07004364}
4365
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07004366static __always_inline
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004367void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07004368{
Paul Turner56f570e2011-11-07 20:26:33 -08004369 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turnerec12cb72011-07-21 09:43:30 -07004370 return;
4371
4372 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4373}
4374
Paul Turner85dac902011-07-21 09:43:33 -07004375static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4376{
Paul Turner56f570e2011-11-07 20:26:33 -08004377 return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner85dac902011-07-21 09:43:33 -07004378}
4379
Paul Turner64660c82011-07-21 09:43:36 -07004380/* check whether cfs_rq, or any parent, is throttled */
4381static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4382{
Paul Turner56f570e2011-11-07 20:26:33 -08004383 return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner64660c82011-07-21 09:43:36 -07004384}
4385
4386/*
4387 * Ensure that neither of the group entities corresponding to src_cpu or
4388 * dest_cpu are members of a throttled hierarchy when performing group
4389 * load-balance operations.
4390 */
4391static inline int throttled_lb_pair(struct task_group *tg,
4392 int src_cpu, int dest_cpu)
4393{
4394 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4395
4396 src_cfs_rq = tg->cfs_rq[src_cpu];
4397 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4398
4399 return throttled_hierarchy(src_cfs_rq) ||
4400 throttled_hierarchy(dest_cfs_rq);
4401}
4402
Paul Turner64660c82011-07-21 09:43:36 -07004403static int tg_unthrottle_up(struct task_group *tg, void *data)
4404{
4405 struct rq *rq = data;
4406 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4407
4408 cfs_rq->throttle_count--;
Paul Turner64660c82011-07-21 09:43:36 -07004409 if (!cfs_rq->throttle_count) {
Paul Turnerf1b17282012-10-04 13:18:31 +02004410 /* adjust cfs_rq_clock_task() */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004411 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turnerf1b17282012-10-04 13:18:31 +02004412 cfs_rq->throttled_clock_task;
Paul Turner64660c82011-07-21 09:43:36 -07004413 }
Paul Turner64660c82011-07-21 09:43:36 -07004414
4415 return 0;
4416}
4417
4418static int tg_throttle_down(struct task_group *tg, void *data)
4419{
4420 struct rq *rq = data;
4421 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4422
Paul Turner82958362012-10-04 13:18:31 +02004423 /* group is entering throttled state, stop time */
4424 if (!cfs_rq->throttle_count)
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004425 cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner64660c82011-07-21 09:43:36 -07004426 cfs_rq->throttle_count++;
4427
4428 return 0;
4429}
4430
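/*
 * Throttle @cfs_rq: dequeue its group entity from ancestor cfs_rqs until
 * one still has other runnable load, adjust h_nr_running all the way up,
 * mark the cfs_rq throttled and put it on cfs_b's throttled list so the
 * period/slack timers can unthrottle it later.
 */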
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004431static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner85dac902011-07-21 09:43:33 -07004432{
4433 struct rq *rq = rq_of(cfs_rq);
4434 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4435 struct sched_entity *se;
4436 long task_delta, dequeue = 1;
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004437 bool empty;
Paul Turner85dac902011-07-21 09:43:33 -07004438
4439 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4440
Paul Turnerf1b17282012-10-04 13:18:31 +02004441 /* freeze hierarchy runnable averages while throttled */
Paul Turner64660c82011-07-21 09:43:36 -07004442 rcu_read_lock();
4443 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4444 rcu_read_unlock();
Paul Turner85dac902011-07-21 09:43:33 -07004445
4446 task_delta = cfs_rq->h_nr_running;
4447 for_each_sched_entity(se) {
4448 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4449 /* throttled entity or throttle-on-deactivate */
4450 if (!se->on_rq)
4451 break;
4452
4453 if (dequeue)
4454 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4455 qcfs_rq->h_nr_running -= task_delta;
4456
4457 if (qcfs_rq->load.weight)
4458 dequeue = 0;
4459 }
4460
4461 if (!se)
Kirill Tkhai72465442014-05-09 03:00:14 +04004462 sub_nr_running(rq, task_delta);
Paul Turner85dac902011-07-21 09:43:33 -07004463
4464 cfs_rq->throttled = 1;
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004465 cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner85dac902011-07-21 09:43:33 -07004466 raw_spin_lock(&cfs_b->lock);
Cong Wangd49db342015-06-24 12:41:47 -07004467 empty = list_empty(&cfs_b->throttled_cfs_rq);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004468
Ben Segallc06f04c2014-06-20 15:21:20 -07004469 /*
4470 * Add to the _head_ of the list, so that an already-started
4471 * distribute_cfs_runtime will not see us
4472 */
4473 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004474
4475 /*
4476 * If we're the first throttled task, make sure the bandwidth
4477 * timer is running.
4478 */
4479 if (empty)
4480 start_cfs_bandwidth(cfs_b);
4481
Paul Turner85dac902011-07-21 09:43:33 -07004482 raw_spin_unlock(&cfs_b->lock);
4483}
4484
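/*
 * Reverse of throttle_cfs_rq(): account the time spent throttled,
 * re-enqueue the group entity up the hierarchy and, if the CPU went idle
 * while this cfs_rq was throttled, kick it so the work is picked up.
 */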
Peter Zijlstra029632f2011-10-25 10:00:11 +02004485void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner671fd9d2011-07-21 09:43:34 -07004486{
4487 struct rq *rq = rq_of(cfs_rq);
4488 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4489 struct sched_entity *se;
4490 int enqueue = 1;
4491 long task_delta;
4492
Michael Wang22b958d2013-06-04 14:23:39 +08004493 se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner671fd9d2011-07-21 09:43:34 -07004494
4495 cfs_rq->throttled = 0;
Frederic Weisbecker1a55af22013-04-12 01:51:01 +02004496
4497 update_rq_clock(rq);
4498
Paul Turner671fd9d2011-07-21 09:43:34 -07004499 raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004500 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner671fd9d2011-07-21 09:43:34 -07004501 list_del_rcu(&cfs_rq->throttled_list);
4502 raw_spin_unlock(&cfs_b->lock);
4503
Paul Turner64660c82011-07-21 09:43:36 -07004504 /* update hierarchical throttle state */
4505 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4506
Paul Turner671fd9d2011-07-21 09:43:34 -07004507 if (!cfs_rq->load.weight)
4508 return;
4509
4510 task_delta = cfs_rq->h_nr_running;
4511 for_each_sched_entity(se) {
4512 if (se->on_rq)
4513 enqueue = 0;
4514
4515 cfs_rq = cfs_rq_of(se);
4516 if (enqueue)
4517 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4518 cfs_rq->h_nr_running += task_delta;
4519
4520 if (cfs_rq_throttled(cfs_rq))
4521 break;
4522 }
4523
4524 if (!se)
Kirill Tkhai72465442014-05-09 03:00:14 +04004525 add_nr_running(rq, task_delta);
Paul Turner671fd9d2011-07-21 09:43:34 -07004526
Ingo Molnar97fb7a02018-03-03 14:01:12 +01004527 /* Determine whether we need to wake up potentially idle CPU: */
Paul Turner671fd9d2011-07-21 09:43:34 -07004528 if (rq->curr == rq->idle && rq->cfs.nr_running)
Kirill Tkhai88751252014-06-29 00:03:57 +04004529 resched_curr(rq);
Paul Turner671fd9d2011-07-21 09:43:34 -07004530}
4531
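/*
 * Hand out up to @remaining runtime to throttled cfs_rqs: each gets just
 * enough (its deficit plus one) to go positive and be unthrottled.
 * Returns the amount actually distributed.
 */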
4532static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4533 u64 remaining, u64 expires)
4534{
4535 struct cfs_rq *cfs_rq;
Ben Segallc06f04c2014-06-20 15:21:20 -07004536 u64 runtime;
4537 u64 starting_runtime = remaining;
Paul Turner671fd9d2011-07-21 09:43:34 -07004538
4539 rcu_read_lock();
4540 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4541 throttled_list) {
4542 struct rq *rq = rq_of(cfs_rq);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02004543 struct rq_flags rf;
Paul Turner671fd9d2011-07-21 09:43:34 -07004544
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02004545 rq_lock(rq, &rf);
Paul Turner671fd9d2011-07-21 09:43:34 -07004546 if (!cfs_rq_throttled(cfs_rq))
4547 goto next;
4548
4549 runtime = -cfs_rq->runtime_remaining + 1;
4550 if (runtime > remaining)
4551 runtime = remaining;
4552 remaining -= runtime;
4553
4554 cfs_rq->runtime_remaining += runtime;
4555 cfs_rq->runtime_expires = expires;
4556
4557 /* we check whether we're throttled above */
4558 if (cfs_rq->runtime_remaining > 0)
4559 unthrottle_cfs_rq(cfs_rq);
4560
4561next:
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02004562 rq_unlock(rq, &rf);
Paul Turner671fd9d2011-07-21 09:43:34 -07004563
4564 if (!remaining)
4565 break;
4566 }
4567 rcu_read_unlock();
4568
Ben Segallc06f04c2014-06-20 15:21:20 -07004569 return starting_runtime - remaining;
Paul Turner671fd9d2011-07-21 09:43:34 -07004570}
4571
Paul Turner58088ad2011-07-21 09:43:31 -07004572/*
4573 * Responsible for refilling a task_group's bandwidth and unthrottling its
4574 * cfs_rqs as appropriate. If there has been no activity within the last
4575 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4576 * used to track this state.
4577 */
4578static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4579{
Paul Turner671fd9d2011-07-21 09:43:34 -07004580 u64 runtime, runtime_expires;
Ben Segall51f21762014-05-19 15:49:45 -07004581 int throttled;
Paul Turner58088ad2011-07-21 09:43:31 -07004582
Paul Turner58088ad2011-07-21 09:43:31 -07004583 /* no need to continue the timer with no bandwidth constraint */
4584 if (cfs_b->quota == RUNTIME_INF)
Ben Segall51f21762014-05-19 15:49:45 -07004585 goto out_deactivate;
Paul Turner58088ad2011-07-21 09:43:31 -07004586
Paul Turner671fd9d2011-07-21 09:43:34 -07004587 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Nikhil Raoe8da1b12011-07-21 09:43:40 -07004588 cfs_b->nr_periods += overrun;
Paul Turner671fd9d2011-07-21 09:43:34 -07004589
Ben Segall51f21762014-05-19 15:49:45 -07004590 /*
4591 * idle depends on !throttled (for the case of a large deficit), and if
4592 * we're going inactive then everything else can be deferred
4593 */
4594 if (cfs_b->idle && !throttled)
4595 goto out_deactivate;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004596
4597 __refill_cfs_bandwidth_runtime(cfs_b);
4598
Paul Turner671fd9d2011-07-21 09:43:34 -07004599 if (!throttled) {
4600 /* mark as potentially idle for the upcoming period */
4601 cfs_b->idle = 1;
Ben Segall51f21762014-05-19 15:49:45 -07004602 return 0;
Paul Turner671fd9d2011-07-21 09:43:34 -07004603 }
Paul Turner58088ad2011-07-21 09:43:31 -07004604
Nikhil Raoe8da1b12011-07-21 09:43:40 -07004605 /* account preceding periods in which throttling occurred */
4606 cfs_b->nr_throttled += overrun;
4607
Paul Turner671fd9d2011-07-21 09:43:34 -07004608 runtime_expires = cfs_b->runtime_expires;
Paul Turner671fd9d2011-07-21 09:43:34 -07004609
4610 /*
Ben Segallc06f04c2014-06-20 15:21:20 -07004611 * This check is repeated as we are holding onto the new bandwidth while
4612 * we unthrottle. This can potentially race with an unthrottled group
4613 * trying to acquire new bandwidth from the global pool. This can result
4614 * in us over-using our runtime if it is all used during this loop, but
4615 * only by limited amounts in that extreme case.
Paul Turner671fd9d2011-07-21 09:43:34 -07004616 */
Ben Segallc06f04c2014-06-20 15:21:20 -07004617 while (throttled && cfs_b->runtime > 0) {
4618 runtime = cfs_b->runtime;
Paul Turner671fd9d2011-07-21 09:43:34 -07004619 raw_spin_unlock(&cfs_b->lock);
4620 /* we can't nest cfs_b->lock while distributing bandwidth */
4621 runtime = distribute_cfs_runtime(cfs_b, runtime,
4622 runtime_expires);
4623 raw_spin_lock(&cfs_b->lock);
4624
4625 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Ben Segallc06f04c2014-06-20 15:21:20 -07004626
4627 cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner671fd9d2011-07-21 09:43:34 -07004628 }
4629
Paul Turner671fd9d2011-07-21 09:43:34 -07004630 /*
4631 * While we are ensured activity in the period following an
4632 * unthrottle, this also covers the case in which the new bandwidth is
4633 * insufficient to cover the existing bandwidth deficit. (Forcing the
4634 * timer to remain active while there are any throttled entities.)
4635 */
4636 cfs_b->idle = 0;
Paul Turner58088ad2011-07-21 09:43:31 -07004637
Ben Segall51f21762014-05-19 15:49:45 -07004638 return 0;
4639
4640out_deactivate:
Ben Segall51f21762014-05-19 15:49:45 -07004641 return 1;
Paul Turner58088ad2011-07-21 09:43:31 -07004642}
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004643
Paul Turnerd8b49862011-07-21 09:43:41 -07004644/* a cfs_rq won't donate quota below this amount */
4645static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4646/* minimum remaining period time to redistribute slack quota */
4647static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4648/* how long we wait to gather additional slack before distributing */
4649static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4650
Ben Segalldb06e782013-10-16 11:16:17 -07004651/*
4652 * Are we near the end of the current quota period?
4653 *
4654 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
Thomas Gleixner4961b6e2015-04-14 21:09:05 +00004655 * hrtimer base being cleared by hrtimer_start. In the case of
Ben Segalldb06e782013-10-16 11:16:17 -07004656 * migrate_hrtimers, base is never cleared, so we are fine.
4657 */
Paul Turnerd8b49862011-07-21 09:43:41 -07004658static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4659{
4660 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4661 u64 remaining;
4662
4663 /* if the call-back is running a quota refresh is already occurring */
4664 if (hrtimer_callback_running(refresh_timer))
4665 return 1;
4666
4667 /* is a quota refresh about to occur? */
4668 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4669 if (remaining < min_expire)
4670 return 1;
4671
4672 return 0;
4673}
4674
4675static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4676{
4677 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4678
4679 /* if there's a quota refresh soon don't bother with slack */
4680 if (runtime_refresh_within(cfs_b, min_left))
4681 return;
4682
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004683 hrtimer_start(&cfs_b->slack_timer,
4684 ns_to_ktime(cfs_bandwidth_slack_period),
4685 HRTIMER_MODE_REL);
Paul Turnerd8b49862011-07-21 09:43:41 -07004686}
4687
4688/* we know any runtime found here is valid as update_curr() precedes return */
4689static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4690{
4691 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4692 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4693
4694 if (slack_runtime <= 0)
4695 return;
4696
4697 raw_spin_lock(&cfs_b->lock);
4698 if (cfs_b->quota != RUNTIME_INF &&
4699 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4700 cfs_b->runtime += slack_runtime;
4701
4702 /* we are under rq->lock, defer unthrottling using a timer */
4703 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4704 !list_empty(&cfs_b->throttled_cfs_rq))
4705 start_cfs_slack_bandwidth(cfs_b);
4706 }
4707 raw_spin_unlock(&cfs_b->lock);
4708
4709 /* even if it's not valid for return we don't want to try again */
4710 cfs_rq->runtime_remaining -= slack_runtime;
4711}
4712
4713static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4714{
Paul Turner56f570e2011-11-07 20:26:33 -08004715 if (!cfs_bandwidth_used())
4716 return;
4717
Paul Turnerfccfdc62011-11-07 20:26:34 -08004718 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
Paul Turnerd8b49862011-07-21 09:43:41 -07004719 return;
4720
4721 __return_cfs_rq_runtime(cfs_rq);
4722}
4723
4724/*
4725 * This is done with a timer (instead of inline with bandwidth return) since
4726 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4727 */
4728static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4729{
4730 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4731 u64 expires;
4732
4733 /* confirm we're still not at a refresh boundary */
Paul Turnerd8b49862011-07-21 09:43:41 -07004734 raw_spin_lock(&cfs_b->lock);
Ben Segalldb06e782013-10-16 11:16:17 -07004735 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4736 raw_spin_unlock(&cfs_b->lock);
4737 return;
4738 }
4739
Ben Segallc06f04c2014-06-20 15:21:20 -07004740 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
Paul Turnerd8b49862011-07-21 09:43:41 -07004741 runtime = cfs_b->runtime;
Ben Segallc06f04c2014-06-20 15:21:20 -07004742
Paul Turnerd8b49862011-07-21 09:43:41 -07004743 expires = cfs_b->runtime_expires;
4744 raw_spin_unlock(&cfs_b->lock);
4745
4746 if (!runtime)
4747 return;
4748
4749 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4750
4751 raw_spin_lock(&cfs_b->lock);
4752 if (expires == cfs_b->runtime_expires)
Ben Segallc06f04c2014-06-20 15:21:20 -07004753 cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turnerd8b49862011-07-21 09:43:41 -07004754 raw_spin_unlock(&cfs_b->lock);
4755}
4756
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004757/*
4758 * When a group wakes up we want to make sure that its quota is not already
4759 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4760 * runtime, since update_curr() throttling cannot trigger until it's on-rq.
4761 */
4762static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4763{
Paul Turner56f570e2011-11-07 20:26:33 -08004764 if (!cfs_bandwidth_used())
4765 return;
4766
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004767 /* an active group must be handled by the update_curr()->put() path */
4768 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4769 return;
4770
4771 /* ensure the group is not already throttled */
4772 if (cfs_rq_throttled(cfs_rq))
4773 return;
4774
4775 /* update runtime allocation */
4776 account_cfs_rq_runtime(cfs_rq, 0);
4777 if (cfs_rq->runtime_remaining <= 0)
4778 throttle_cfs_rq(cfs_rq);
4779}
4780
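/*
 * When a child cfs_rq is attached under a (possibly throttled) parent,
 * copy the parent's throttle_count and take a fresh clock snapshot so the
 * child's throttled-time accounting starts out consistent.
 */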
Peter Zijlstra55e16d32016-06-22 15:14:26 +02004781static void sync_throttle(struct task_group *tg, int cpu)
4782{
4783 struct cfs_rq *pcfs_rq, *cfs_rq;
4784
4785 if (!cfs_bandwidth_used())
4786 return;
4787
4788 if (!tg->parent)
4789 return;
4790
4791 cfs_rq = tg->cfs_rq[cpu];
4792 pcfs_rq = tg->parent->cfs_rq[cpu];
4793
4794 cfs_rq->throttle_count = pcfs_rq->throttle_count;
Xunlei Pangb8922122016-07-09 15:54:22 +08004795 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
Peter Zijlstra55e16d32016-06-22 15:14:26 +02004796}
4797
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004798/* conditionally throttle active cfs_rq's from put_prev_entity() */
Peter Zijlstra678d5712012-02-11 06:05:00 +01004799static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004800{
Paul Turner56f570e2011-11-07 20:26:33 -08004801 if (!cfs_bandwidth_used())
Peter Zijlstra678d5712012-02-11 06:05:00 +01004802 return false;
Paul Turner56f570e2011-11-07 20:26:33 -08004803
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004804 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
Peter Zijlstra678d5712012-02-11 06:05:00 +01004805 return false;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004806
4807 /*
4808 * it's possible for a throttled entity to be forced into a running
4809 * state (e.g. set_curr_task), in this case we're finished.
4810 */
4811 if (cfs_rq_throttled(cfs_rq))
Peter Zijlstra678d5712012-02-11 06:05:00 +01004812 return true;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004813
4814 throttle_cfs_rq(cfs_rq);
Peter Zijlstra678d5712012-02-11 06:05:00 +01004815 return true;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004816}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004817
Peter Zijlstra029632f2011-10-25 10:00:11 +02004818static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4819{
4820 struct cfs_bandwidth *cfs_b =
4821 container_of(timer, struct cfs_bandwidth, slack_timer);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004822
Peter Zijlstra029632f2011-10-25 10:00:11 +02004823 do_sched_cfs_slack_timer(cfs_b);
4824
4825 return HRTIMER_NORESTART;
4826}
4827
4828static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4829{
4830 struct cfs_bandwidth *cfs_b =
4831 container_of(timer, struct cfs_bandwidth, period_timer);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004832 int overrun;
4833 int idle = 0;
4834
Ben Segall51f21762014-05-19 15:49:45 -07004835 raw_spin_lock(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004836 for (;;) {
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004837 overrun = hrtimer_forward_now(timer, cfs_b->period);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004838 if (!overrun)
4839 break;
4840
4841 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4842 }
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004843 if (idle)
4844 cfs_b->period_active = 0;
Ben Segall51f21762014-05-19 15:49:45 -07004845 raw_spin_unlock(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004846
4847 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4848}
4849
4850void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4851{
4852 raw_spin_lock_init(&cfs_b->lock);
4853 cfs_b->runtime = 0;
4854 cfs_b->quota = RUNTIME_INF;
4855 cfs_b->period = ns_to_ktime(default_cfs_period());
4856
4857 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004858 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004859 cfs_b->period_timer.function = sched_cfs_period_timer;
4860 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4861 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4862}
4863
4864static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4865{
4866 cfs_rq->runtime_enabled = 0;
4867 INIT_LIST_HEAD(&cfs_rq->throttled_list);
4868}
4869
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004870void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
Peter Zijlstra029632f2011-10-25 10:00:11 +02004871{
Xunlei Pangf1d1be82018-06-20 18:18:34 +08004872 u64 overrun;
4873
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02004874 lockdep_assert_held(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004875
Xunlei Pangf1d1be82018-06-20 18:18:34 +08004876 if (cfs_b->period_active)
4877 return;
4878
4879 cfs_b->period_active = 1;
4880 overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4881 cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
4882 cfs_b->expires_seq++;
4883 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004884}
4885
4886static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4887{
Tetsuo Handa7f1a1692014-12-25 15:51:21 +09004888 /* init_cfs_bandwidth() was not called */
4889 if (!cfs_b->throttled_cfs_rq.next)
4890 return;
4891
Peter Zijlstra029632f2011-10-25 10:00:11 +02004892 hrtimer_cancel(&cfs_b->period_timer);
4893 hrtimer_cancel(&cfs_b->slack_timer);
4894}
4895
Peter Zijlstra502ce002017-05-04 15:31:22 +02004896/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01004897 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
Peter Zijlstra502ce002017-05-04 15:31:22 +02004898 *
4899 * The race is harmless, since modifying bandwidth settings of unhooked group
4900 * bits doesn't do much.
4901 */
4902
4903/* cpu online callback */
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004904static void __maybe_unused update_runtime_enabled(struct rq *rq)
4905{
Peter Zijlstra502ce002017-05-04 15:31:22 +02004906 struct task_group *tg;
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004907
Peter Zijlstra502ce002017-05-04 15:31:22 +02004908 lockdep_assert_held(&rq->lock);
4909
4910 rcu_read_lock();
4911 list_for_each_entry_rcu(tg, &task_groups, list) {
4912 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
4913 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004914
4915 raw_spin_lock(&cfs_b->lock);
4916 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4917 raw_spin_unlock(&cfs_b->lock);
4918 }
Peter Zijlstra502ce002017-05-04 15:31:22 +02004919 rcu_read_unlock();
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004920}
4921
Peter Zijlstra502ce002017-05-04 15:31:22 +02004922/* cpu offline callback */
Arnd Bergmann38dc3342013-01-25 14:14:22 +00004923static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra029632f2011-10-25 10:00:11 +02004924{
Peter Zijlstra502ce002017-05-04 15:31:22 +02004925 struct task_group *tg;
Peter Zijlstra029632f2011-10-25 10:00:11 +02004926
Peter Zijlstra502ce002017-05-04 15:31:22 +02004927 lockdep_assert_held(&rq->lock);
4928
4929 rcu_read_lock();
4930 list_for_each_entry_rcu(tg, &task_groups, list) {
4931 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4932
Peter Zijlstra029632f2011-10-25 10:00:11 +02004933 if (!cfs_rq->runtime_enabled)
4934 continue;
4935
4936 /*
4937 * clock_task is not advancing so we just need to make sure
4938 * there's some valid quota amount
4939 */
Ben Segall51f21762014-05-19 15:49:45 -07004940 cfs_rq->runtime_remaining = 1;
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004941 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01004942 * Offline rq is schedulable till CPU is completely disabled
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004943 * in take_cpu_down(), so we prevent new cfs throttling here.
4944 */
4945 cfs_rq->runtime_enabled = 0;
4946
Peter Zijlstra029632f2011-10-25 10:00:11 +02004947 if (cfs_rq_throttled(cfs_rq))
4948 unthrottle_cfs_rq(cfs_rq);
4949 }
Peter Zijlstra502ce002017-05-04 15:31:22 +02004950 rcu_read_unlock();
Peter Zijlstra029632f2011-10-25 10:00:11 +02004951}
4952
4953#else /* CONFIG_CFS_BANDWIDTH */
Paul Turnerf1b17282012-10-04 13:18:31 +02004954static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4955{
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004956 return rq_clock_task(rq_of(cfs_rq));
Paul Turnerf1b17282012-10-04 13:18:31 +02004957}
4958
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004959static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
Peter Zijlstra678d5712012-02-11 06:05:00 +01004960static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004961static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra55e16d32016-06-22 15:14:26 +02004962static inline void sync_throttle(struct task_group *tg, int cpu) {}
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07004963static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner85dac902011-07-21 09:43:33 -07004964
4965static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4966{
4967 return 0;
4968}
Paul Turner64660c82011-07-21 09:43:36 -07004969
4970static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4971{
4972 return 0;
4973}
4974
4975static inline int throttled_lb_pair(struct task_group *tg,
4976 int src_cpu, int dest_cpu)
4977{
4978 return 0;
4979}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004980
4981void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4982
4983#ifdef CONFIG_FAIR_GROUP_SCHED
4984static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turnerab84d312011-07-21 09:43:28 -07004985#endif
4986
Peter Zijlstra029632f2011-10-25 10:00:11 +02004987static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4988{
4989 return NULL;
4990}
4991static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04004992static inline void update_runtime_enabled(struct rq *rq) {}
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -07004993static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra029632f2011-10-25 10:00:11 +02004994
4995#endif /* CONFIG_CFS_BANDWIDTH */
4996
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004997/**************************************************
4998 * CFS operations on tasks:
4999 */
5000
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005001#ifdef CONFIG_SCHED_HRTICK
5002static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5003{
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005004 struct sched_entity *se = &p->se;
5005 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5006
Peter Zijlstra9148a3a2016-09-20 22:34:51 +02005007 SCHED_WARN_ON(task_rq(p) != rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005008
Srivatsa Vaddagiri8bf46a32016-09-16 18:28:51 -07005009 if (rq->cfs.h_nr_running > 1) {
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005010 u64 slice = sched_slice(cfs_rq, se);
5011 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5012 s64 delta = slice - ran;
5013
5014 if (delta < 0) {
5015 if (rq->curr == p)
Kirill Tkhai88751252014-06-29 00:03:57 +04005016 resched_curr(rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005017 return;
5018 }
Peter Zijlstra31656512008-07-18 18:01:23 +02005019 hrtick_start(rq, delta);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005020 }
5021}
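/*
 * Editor's note (not from the original source): a worked example of the
 * hrtick programming above, with illustrative numbers.  If sched_slice()
 * computes a 4ms slice for the current entity and it has already run for
 * 1.5ms since it was last picked (sum_exec_runtime - prev_sum_exec_runtime),
 * the hrtimer is armed 2.5ms out so preemption happens right at slice end
 * instead of waiting for the next regular tick.  A negative delta means the
 * slice is already overrun, so the current task is rescheduled immediately.
 */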
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005022
5023/*
5024 * called from enqueue/dequeue and updates the hrtick when the
5025 * current task is from our class and nr_running is low enough
5026 * to matter.
5027 */
5028static void hrtick_update(struct rq *rq)
5029{
5030 struct task_struct *curr = rq->curr;
5031
Mike Galbraithb39e66e2011-11-22 15:20:07 +01005032 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005033 return;
5034
5035 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5036 hrtick_start_fair(rq, curr);
5037}
Dhaval Giani55e12e52008-06-24 23:39:43 +05305038#else /* !CONFIG_SCHED_HRTICK */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005039static inline void
5040hrtick_start_fair(struct rq *rq, struct task_struct *p)
5041{
5042}
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005043
5044static inline void hrtick_update(struct rq *rq)
5045{
5046}
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005047#endif
5048
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005049/*
5050 * The enqueue_task method is called before nr_running is
5051 * increased. Here we update the fair scheduling stats and
5052 * then put the task into the rbtree:
5053 */
Thomas Gleixnerea87bb72010-01-20 20:58:57 +00005054static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005055enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005056{
5057 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005058 struct sched_entity *se = &p->se;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005059
Rafael J. Wysocki8c34ab12016-09-09 23:59:33 +02005060 /*
Patrick Bellasi2539fc82018-05-24 15:10:23 +01005061 * The code below (indirectly) updates schedutil which looks at
5062 * the cfs_rq utilization to select a frequency.
5063 * Let's add the task's estimated utilization to the cfs_rq's
5064 * estimated utilization, before we update schedutil.
5065 */
5066 util_est_enqueue(&rq->cfs, p);
5067
5068 /*
Rafael J. Wysocki8c34ab12016-09-09 23:59:33 +02005069 * If in_iowait is set, the code below may not trigger any cpufreq
5070 * utilization updates, so do it here explicitly with the IOWAIT flag
5071 * passed.
5072 */
5073 if (p->in_iowait)
Viresh Kumar674e7542017-07-28 12:16:38 +05305074 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
Rafael J. Wysocki8c34ab12016-09-09 23:59:33 +02005075
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005076 for_each_sched_entity(se) {
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005077 if (se->on_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005078 break;
5079 cfs_rq = cfs_rq_of(se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005080 enqueue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07005081
5082 /*
5083 * end evaluation on encountering a throttled cfs_rq
5084 *
5085 * note: in the case of encountering a throttled cfs_rq we will
5086 * post the final h_nr_running increment below.
Peter Zijlstrae210bff2016-06-16 18:51:48 +02005087 */
Paul Turner85dac902011-07-21 09:43:33 -07005088 if (cfs_rq_throttled(cfs_rq))
5089 break;
Paul Turner953bfcd2011-07-21 09:43:27 -07005090 cfs_rq->h_nr_running++;
Paul Turner85dac902011-07-21 09:43:33 -07005091
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005092 flags = ENQUEUE_WAKEUP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005093 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005094
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005095 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08005096 cfs_rq = cfs_rq_of(se);
Paul Turner953bfcd2011-07-21 09:43:27 -07005097 cfs_rq->h_nr_running++;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005098
Paul Turner85dac902011-07-21 09:43:33 -07005099 if (cfs_rq_throttled(cfs_rq))
5100 break;
5101
Peter Zijlstra88c06162017-05-06 17:32:43 +02005102 update_load_avg(cfs_rq, se, UPDATE_TG);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02005103 update_cfs_group(se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005104 }
5105
Yuyang Ducd126af2015-07-15 08:04:36 +08005106 if (!se)
Kirill Tkhai72465442014-05-09 03:00:14 +04005107 add_nr_running(rq, 1);
Yuyang Ducd126af2015-07-15 08:04:36 +08005108
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005109 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005110}
5111
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005112static void set_next_buddy(struct sched_entity *se);
5113
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005114/*
5115 * The dequeue_task method is called before nr_running is
5116 * decreased. We remove the task from the rbtree and
5117 * update the fair scheduling stats:
5118 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005119static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005120{
5121 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005122 struct sched_entity *se = &p->se;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005123 int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005124
5125 for_each_sched_entity(se) {
5126 cfs_rq = cfs_rq_of(se);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005127 dequeue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07005128
5129 /*
5130 * end evaluation on encountering a throttled cfs_rq
5131 *
5132 * note: in the case of encountering a throttled cfs_rq we will
5133 * post the final h_nr_running decrement below.
5134 */
5135 if (cfs_rq_throttled(cfs_rq))
5136 break;
Paul Turner953bfcd2011-07-21 09:43:27 -07005137 cfs_rq->h_nr_running--;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005138
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005139 /* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005140 if (cfs_rq->load.weight) {
Konstantin Khlebnikov754bd592016-06-16 15:57:15 +03005141 /* Avoid re-evaluating load for this entity: */
5142 se = parent_entity(se);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005143 /*
5144 * Bias pick_next to pick a task from this cfs_rq, as
5145 * p is sleeping when it is within its sched_slice.
5146 */
Konstantin Khlebnikov754bd592016-06-16 15:57:15 +03005147 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5148 set_next_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005149 break;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005150 }
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005151 flags |= DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005152 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005153
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005154 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08005155 cfs_rq = cfs_rq_of(se);
Paul Turner953bfcd2011-07-21 09:43:27 -07005156 cfs_rq->h_nr_running--;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005157
Paul Turner85dac902011-07-21 09:43:33 -07005158 if (cfs_rq_throttled(cfs_rq))
5159 break;
5160
Peter Zijlstra88c06162017-05-06 17:32:43 +02005161 update_load_avg(cfs_rq, se, UPDATE_TG);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02005162 update_cfs_group(se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005163 }
5164
Yuyang Ducd126af2015-07-15 08:04:36 +08005165 if (!se)
Kirill Tkhai72465442014-05-09 03:00:14 +04005166 sub_nr_running(rq, 1);
Yuyang Ducd126af2015-07-15 08:04:36 +08005167
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00005168 util_est_dequeue(&rq->cfs, p, task_sleep);
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005169 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005170}
5171
Gregory Haskinse7693a32008-01-25 21:08:09 +01005172#ifdef CONFIG_SMP
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005173
5174/* Working cpumask for: load_balance, load_balance_newidle. */
5175DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5176DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5177
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005178#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005179/*
5180 * per rq 'load' array crap; XXX kill this.
5181 */
5182
5183/*
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005184 * The exact cpuload calculated at every tick would be:
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005185 *
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005186 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5187 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005188 * If a CPU misses updates for n ticks (as it was idle) and update gets
5189 * called on the n+1-th tick when CPU may be busy, then we have:
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005190 *
5191 * load_n = (1 - 1/2^i)^n * load_0
5192 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005193 *
5194 * decay_load_missed() below does efficient calculation of
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005195 *
5196 * load' = (1 - 1/2^i)^n * load
5197 *
5198 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5199 * This allows us to precompute the above in said factors, thereby allowing the
5200 * reduction of an arbitrary n in O(log_2 n) steps. (See also
5201 * fixed_power_int())
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005202 *
5203 * The calculation is approximated on a 128 point scale.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005204 */
5205#define DEGRADE_SHIFT 7
Peter Zijlstrad937cdc2015-10-19 13:49:30 +02005206
5207static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5208static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5209 { 0, 0, 0, 0, 0, 0, 0, 0 },
5210 { 64, 32, 8, 0, 0, 0, 0, 0 },
5211 { 96, 72, 40, 12, 1, 0, 0, 0 },
5212 { 112, 98, 75, 43, 15, 1, 0, 0 },
5213 { 120, 112, 98, 76, 45, 16, 2, 0 }
5214};
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005215
5216/*
5217 * Update cpu_load for any missed ticks due to tickless idle. The backlog
5218 * builds up while the CPU is idle, so we just decay the old load without
5219 * adding any new load.
5220 */
5221static unsigned long
5222decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5223{
5224 int j = 0;
5225
5226 if (!missed_updates)
5227 return load;
5228
5229 if (missed_updates >= degrade_zero_ticks[idx])
5230 return 0;
5231
5232 if (idx == 1)
5233 return load >> missed_updates;
5234
5235 while (missed_updates) {
5236 if (missed_updates % 2)
5237 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5238
5239 missed_updates >>= 1;
5240 j++;
5241 }
5242 return load;
5243}
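/*
 * Editor's sketch (not part of the original source): a worked example of
 * the table-driven decay above.  For idx == 2 the per-tick factor is
 * (1 - 1/4) on the 128-point scale, and degrade_factor[2] holds that
 * factor raised to 1, 2, 4, 8 and 16 ticks:
 *
 *   degrade_factor[2] = { 96, 72, 40, 12, 1, 0, 0, 0 }
 *
 * With missed_updates == 5 (binary 101) the loop applies the 1-tick and
 * the 4-tick entries:
 *
 *   load' = load * 96/128 * 40/128 ~= 0.234 * load
 *
 * which approximates the exact (3/4)^5 ~= 0.237.  degrade_zero_ticks[2]
 * is 32, so 32 or more missed ticks short-circuit to zero, and idx == 1
 * can use a plain shift because (1 - 1/2)^n == 1/2^n exactly.
 */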
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01005244
5245static struct {
5246 cpumask_var_t idle_cpus_mask;
5247 atomic_t nr_cpus;
Vincent Guittotf643ea22018-02-13 11:31:17 +01005248 int has_blocked; /* Idle CPUs have blocked load */
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01005249 unsigned long next_balance; /* in jiffy units */
Vincent Guittotf643ea22018-02-13 11:31:17 +01005250 unsigned long next_blocked; /* Next update of blocked load in jiffies */
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01005251} nohz ____cacheline_aligned;
5252
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005253#endif /* CONFIG_NO_HZ_COMMON */
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005254
Byungchul Park59543272015-10-14 18:47:35 +09005255/**
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02005256 * cpu_load_update - update the rq->cpu_load[] statistics
Byungchul Park59543272015-10-14 18:47:35 +09005257 * @this_rq: The rq to update statistics for
5258 * @this_load: The current load
5259 * @pending_updates: The number of missed updates
Byungchul Park59543272015-10-14 18:47:35 +09005260 *
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005261 * Update rq->cpu_load[] statistics. This function is usually called every
Byungchul Park59543272015-10-14 18:47:35 +09005262 * scheduler tick (TICK_NSEC).
5263 *
5264 * This function computes a decaying average:
5265 *
5266 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5267 *
5268 * Because of NOHZ it might not get called on every tick which gives need for
5269 * the @pending_updates argument.
5270 *
5271 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5272 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5273 * = A * (A * load[i]_n-2 + B) + B
5274 * = A * (A * (A * load[i]_n-3 + B) + B) + B
5275 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5276 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5277 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5278 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5279 *
5280 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5281 * any change in load would have resulted in the tick being turned back on.
5282 *
5283 * For regular NOHZ, this reduces to:
5284 *
5285 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5286 *
5287 * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005288 * term.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005289 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005290static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5291 unsigned long pending_updates)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005292{
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005293 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005294 int i, scale;
5295
5296 this_rq->nr_load_updates++;
5297
5298 /* Update our load: */
5299 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5300 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5301 unsigned long old_load, new_load;
5302
5303 /* scale is effectively 1 << i now, and >> i divides by scale */
5304
Byungchul Park7400d3b2016-01-15 16:07:49 +09005305 old_load = this_rq->cpu_load[i];
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005306#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005307 old_load = decay_load_missed(old_load, pending_updates - 1, i);
Byungchul Park7400d3b2016-01-15 16:07:49 +09005308 if (tickless_load) {
5309 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5310 /*
5311 * old_load can never be a negative value because a
5312 * decayed tickless_load cannot be greater than the
5313 * original tickless_load.
5314 */
5315 old_load += tickless_load;
5316 }
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005317#endif
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005318 new_load = this_load;
5319 /*
5320 * Round up the averaging division if load is increasing. This
5321 * prevents us from getting stuck on 9 if the load is 10, for
5322 * example.
5323 */
5324 if (new_load > old_load)
5325 new_load += scale - 1;
5326
5327 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5328 }
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005329}
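/*
 * Editor's note (not from the original source): the round-up above matters
 * for small deltas.  Take i == 1 (scale == 2), cpu_load[1] == 9 and a
 * current load of 10: the plain average (9 + 10) >> 1 floors back to 9 on
 * every tick and never converges, while adding (scale - 1) first gives
 * (9 + 11) >> 1 == 10.  For decreasing load no rounding is added, so the
 * indices still decay smoothly toward the lower value.
 */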
5330
Yuyang Du7ea241a2015-07-15 08:04:42 +08005331/* Used instead of source_load when we know the type == 0 */
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305332static unsigned long weighted_cpuload(struct rq *rq)
Yuyang Du7ea241a2015-07-15 08:04:42 +08005333{
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305334 return cfs_rq_runnable_load_avg(&rq->cfs);
Yuyang Du7ea241a2015-07-15 08:04:42 +08005335}
5336
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005337#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005338/*
5339 * There is no sane way to deal with nohz on smp when using jiffies because the
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005340 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005341 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5342 *
5343 * Therefore we need to avoid the delta approach from the regular tick when
5344 * possible since that would seriously skew the load calculation. This is why we
5345 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5346 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5347 * loop exit, nohz_idle_balance, nohz full exit...)
5348 *
5349 * This means we might still be one tick off for nohz periods.
5350 */
5351
5352static void cpu_load_update_nohz(struct rq *this_rq,
5353 unsigned long curr_jiffies,
5354 unsigned long load)
Frederic Weisbeckerbe68a682016-01-13 17:01:29 +01005355{
5356 unsigned long pending_updates;
5357
5358 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5359 if (pending_updates) {
5360 this_rq->last_load_update_tick = curr_jiffies;
5361 /*
5362 * In the regular NOHZ case we were idle, which means a load of 0.
5363 * In the NOHZ_FULL case we were non-idle, so we should account the
5364 * CPU's weighted load.
5365 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005366 cpu_load_update(this_rq, load, pending_updates);
Frederic Weisbeckerbe68a682016-01-13 17:01:29 +01005367 }
5368}
5369
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005370/*
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005371 * Called from nohz_idle_balance() to update the load ratings before doing the
5372 * idle balance.
5373 */
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02005374static void cpu_load_update_idle(struct rq *this_rq)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005375{
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005376 /*
5377 * bail if there's load or we're actually up-to-date.
5378 */
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305379 if (weighted_cpuload(this_rq))
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005380 return;
5381
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005382 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005383}
5384
5385/*
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005386 * Record CPU load on nohz entry so we know the tickless load to account
5387 * on nohz exit. cpu_load[0] then happens to be updated more frequently
5388 * than other cpu_load[idx], but that should be fine since cpu_load readers
5389 * shouldn't rely on synchronized cpu_load[*] updates.
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005390 */
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005391void cpu_load_update_nohz_start(void)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005392{
5393 struct rq *this_rq = this_rq();
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005394
5395 /*
5396 * This is all lockless but should be fine. If weighted_cpuload changes
5397 * concurrently we'll exit nohz. And cpu_load write can race with
5398 * cpu_load_update_idle(), but both updaters would be writing the same value.
5399 */
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305400 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005401}
5402
5403/*
5404 * Account the tickless load in the end of a nohz frame.
5405 */
5406void cpu_load_update_nohz_stop(void)
5407{
Jason Low316c1608d2015-04-28 13:00:20 -07005408 unsigned long curr_jiffies = READ_ONCE(jiffies);
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005409 struct rq *this_rq = this_rq();
5410 unsigned long load;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02005411 struct rq_flags rf;
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005412
5413 if (curr_jiffies == this_rq->last_load_update_tick)
5414 return;
5415
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305416 load = weighted_cpuload(this_rq);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02005417 rq_lock(this_rq, &rf);
Matt Flemingb52fad22016-05-03 20:46:54 +01005418 update_rq_clock(this_rq);
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005419 cpu_load_update_nohz(this_rq, curr_jiffies, load);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02005420 rq_unlock(this_rq, &rf);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005421}
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005422#else /* !CONFIG_NO_HZ_COMMON */
5423static inline void cpu_load_update_nohz(struct rq *this_rq,
5424 unsigned long curr_jiffies,
5425 unsigned long load) { }
5426#endif /* CONFIG_NO_HZ_COMMON */
5427
5428static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5429{
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005430#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005431 /* See the mess around cpu_load_update_nohz(). */
5432 this_rq->last_load_update_tick = READ_ONCE(jiffies);
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005433#endif
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005434 cpu_load_update(this_rq, load, 1);
5435}
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005436
5437/*
5438 * Called from scheduler_tick()
5439 */
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02005440void cpu_load_update_active(struct rq *this_rq)
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005441{
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305442 unsigned long load = weighted_cpuload(this_rq);
Frederic Weisbecker1f419062016-04-13 15:56:51 +02005443
5444 if (tick_nohz_tick_stopped())
5445 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5446 else
5447 cpu_load_update_periodic(this_rq, load);
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005448}
5449
Peter Zijlstra029632f2011-10-25 10:00:11 +02005450/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005451 * Return a low guess at the load of a migration-source CPU weighted
Peter Zijlstra029632f2011-10-25 10:00:11 +02005452 * according to the scheduling class and "nice" value.
5453 *
5454 * We want to under-estimate the load of migration sources, to
5455 * balance conservatively.
5456 */
5457static unsigned long source_load(int cpu, int type)
5458{
5459 struct rq *rq = cpu_rq(cpu);
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305460 unsigned long total = weighted_cpuload(rq);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005461
5462 if (type == 0 || !sched_feat(LB_BIAS))
5463 return total;
5464
5465 return min(rq->cpu_load[type-1], total);
5466}
5467
5468/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005469 * Return a high guess at the load of a migration-target CPU weighted
Peter Zijlstra029632f2011-10-25 10:00:11 +02005470 * according to the scheduling class and "nice" value.
5471 */
5472static unsigned long target_load(int cpu, int type)
5473{
5474 struct rq *rq = cpu_rq(cpu);
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305475 unsigned long total = weighted_cpuload(rq);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005476
5477 if (type == 0 || !sched_feat(LB_BIAS))
5478 return total;
5479
5480 return max(rq->cpu_load[type-1], total);
5481}
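/*
 * Editor's note (not from the original source), with illustrative numbers:
 * with LB_BIAS enabled and type == 1, a CPU whose instantaneous
 * weighted_cpuload() is 900 but whose decayed cpu_load[0] is 1200 reports
 * source_load() == min(1200, 900) == 900 when acting as a migration source
 * and target_load() == max(1200, 900) == 1200 when acting as a migration
 * target, i.e. the balancer under-estimates sources and over-estimates
 * targets so that it only moves load when the imbalance is clear.
 */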
5482
Nicolas Pitreced549f2014-05-26 18:19:38 -04005483static unsigned long capacity_of(int cpu)
Peter Zijlstra029632f2011-10-25 10:00:11 +02005484{
Nicolas Pitreced549f2014-05-26 18:19:38 -04005485 return cpu_rq(cpu)->cpu_capacity;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005486}
5487
Vincent Guittotca6d75e2015-02-27 16:54:09 +01005488static unsigned long capacity_orig_of(int cpu)
5489{
5490 return cpu_rq(cpu)->cpu_capacity_orig;
5491}
5492
Peter Zijlstra029632f2011-10-25 10:00:11 +02005493static unsigned long cpu_avg_load_per_task(int cpu)
5494{
5495 struct rq *rq = cpu_rq(cpu);
Jason Low316c1608d2015-04-28 13:00:20 -07005496 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305497 unsigned long load_avg = weighted_cpuload(rq);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005498
5499 if (nr_running)
Alex Shib92486c2013-06-20 10:18:50 +08005500 return load_avg / nr_running;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005501
5502 return 0;
5503}
5504
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02005505static void record_wakee(struct task_struct *p)
5506{
5507 /*
5508 * Only decay a single time; tasks that have less than 1 wakeup per
5509 * jiffy will not have built up many flips.
5510 */
5511 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5512 current->wakee_flips >>= 1;
5513 current->wakee_flip_decay_ts = jiffies;
5514 }
5515
5516 if (current->last_wakee != p) {
5517 current->last_wakee = p;
5518 current->wakee_flips++;
5519 }
5520}
5521
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005522/*
5523 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02005524 *
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005525 * A waker of many should wake a different task than the one last awakened
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02005526 * at a frequency roughly N times higher than one of its wakees.
5527 *
5528 * In order to determine whether we should let the load spread vs consolidating
5529 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5530 * partner, and a factor of llc_size higher frequency in the other.
5531 *
5532 * With both conditions met, we can be relatively sure that the relationship is
5533 * non-monogamous, with partner count exceeding socket size.
5534 *
5535 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5536 * whatever is irrelevant; the spread criterion is that the apparent partner
5537 * count exceeds the socket size.
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005538 */
Michael Wang62470412013-07-04 12:55:51 +08005539static int wake_wide(struct task_struct *p)
5540{
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005541 unsigned int master = current->wakee_flips;
5542 unsigned int slave = p->wakee_flips;
Peter Zijlstra7d9ffa82013-07-04 12:56:46 +08005543 int factor = this_cpu_read(sd_llc_size);
Michael Wang62470412013-07-04 12:55:51 +08005544
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005545 if (master < slave)
5546 swap(master, slave);
5547 if (slave < factor || master < slave * factor)
5548 return 0;
5549 return 1;
Michael Wang62470412013-07-04 12:55:51 +08005550}
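/*
 * Editor's sketch (not from the original source), assuming an LLC with
 * sd_llc_size == 8: a dispatcher whose wakee_flips is 100 waking a worker
 * with wakee_flips == 10 satisfies both conditions (10 >= 8 and
 * 100 >= 10 * 8), so wake_wide() returns 1 and the wakeup is left to
 * spread across the domain.  The same dispatcher waking a worker with only
 * 2 flips returns 0, and the cache-affine wake_affine() path is tried.
 */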
5551
Peter Zijlstra90001d62017-07-31 17:50:05 +02005552/*
Peter Zijlstrad153b152017-09-27 11:35:30 +02005553 * The purpose of wake_affine() is to quickly determine on which CPU we can run
5554 * soonest. For the purpose of speed we only consider the waking and previous
5555 * CPU.
Peter Zijlstra90001d62017-07-31 17:50:05 +02005556 *
Mel Gorman7332dec2017-12-19 08:59:47 +00005557 * wake_affine_idle() - only considers 'now'; it checks whether the waking CPU is
5558 * cache-affine and is (or will be) idle.
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005559 *
5560 * wake_affine_weight() - considers the weight to reflect the average
5561 * scheduling latency of the CPUs. This seems to work
5562 * for the overloaded case.
Peter Zijlstra90001d62017-07-31 17:50:05 +02005563 */
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005564static int
Mel Gorman89a55f52018-01-30 10:45:52 +00005565wake_affine_idle(int this_cpu, int prev_cpu, int sync)
Peter Zijlstra90001d62017-07-31 17:50:05 +02005566{
Mel Gorman7332dec2017-12-19 08:59:47 +00005567 /*
5568 * If this_cpu is idle, it implies the wakeup is from interrupt
5569 * context. Only allow the move if cache is shared. Otherwise an
5570 * interrupt intensive workload could force all tasks onto one
5571 * node depending on the IO topology or IRQ affinity settings.
Mel Gorman806486c2018-01-30 10:45:54 +00005572 *
5573 * If the prev_cpu is idle and cache affine then avoid a migration.
5574 * There is no guarantee that the cache hot data from an interrupt
5575 * is more important than cache hot data on the prev_cpu and from
5576 * a cpufreq perspective, it's better to have higher utilisation
5577 * on one CPU.
Mel Gorman7332dec2017-12-19 08:59:47 +00005578 */
Rohit Jain943d3552018-05-09 09:39:48 -07005579 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5580 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005581
Peter Zijlstrad153b152017-09-27 11:35:30 +02005582 if (sync && cpu_rq(this_cpu)->nr_running == 1)
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005583 return this_cpu;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005584
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005585 return nr_cpumask_bits;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005586}
5587
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005588static int
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005589wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5590 int this_cpu, int prev_cpu, int sync)
Peter Zijlstra90001d62017-07-31 17:50:05 +02005591{
Peter Zijlstra90001d62017-07-31 17:50:05 +02005592 s64 this_eff_load, prev_eff_load;
5593 unsigned long task_load;
5594
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005595 this_eff_load = target_load(this_cpu, sd->wake_idx);
Peter Zijlstra90001d62017-07-31 17:50:05 +02005596
Peter Zijlstra90001d62017-07-31 17:50:05 +02005597 if (sync) {
5598 unsigned long current_load = task_h_load(current);
5599
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005600 if (current_load > this_eff_load)
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005601 return this_cpu;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005602
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005603 this_eff_load -= current_load;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005604 }
5605
Peter Zijlstra90001d62017-07-31 17:50:05 +02005606 task_load = task_h_load(p);
5607
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005608 this_eff_load += task_load;
5609 if (sched_feat(WA_BIAS))
5610 this_eff_load *= 100;
5611 this_eff_load *= capacity_of(prev_cpu);
Peter Zijlstra90001d62017-07-31 17:50:05 +02005612
Mel Gormaneeb60392018-02-13 13:37:26 +00005613 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005614 prev_eff_load -= task_load;
5615 if (sched_feat(WA_BIAS))
5616 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5617 prev_eff_load *= capacity_of(this_cpu);
Peter Zijlstra90001d62017-07-31 17:50:05 +02005618
Mel Gorman082f7642018-02-13 13:37:27 +00005619 /*
5620 * If sync, adjust the weight of prev_eff_load such that if
5621 * prev_eff == this_eff that select_idle_sibling() will consider
5622 * stacking the wakee on top of the waker if no other CPU is
5623 * idle.
5624 */
5625 if (sync)
5626 prev_eff_load += 1;
5627
5628 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005629}
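/*
 * Editor's sketch (not part of the original source), with illustrative
 * numbers, both CPUs at equal capacity and an assumed sd->imbalance_pct
 * of 125: take target_load(this_cpu) == 2048, source_load(prev_cpu) ==
 * 1024, task_h_load(p) == 512 and sync == 0.  With WA_BIAS:
 *
 *   this_eff_load = (2048 + 512) * 100                    = 256000 (x prev capacity)
 *   prev_eff_load = (1024 - 512) * (100 + (125 - 100)/2)  =  57344 (x this capacity)
 *
 * this_eff_load is not smaller, so nr_cpumask_bits is returned and
 * wake_affine() falls back to prev_cpu: the task is pulled to the waking
 * CPU only when that CPU is still the lighter choice after absorbing the
 * task, with roughly a 12% bias toward leaving it where it was.
 */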
5630
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01005631static int wake_affine(struct sched_domain *sd, struct task_struct *p,
Mel Gorman7ebb66a2018-02-13 13:37:25 +00005632 int this_cpu, int prev_cpu, int sync)
Ingo Molnar098fb9d2008-03-16 20:36:10 +01005633{
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005634 int target = nr_cpumask_bits;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01005635
Mel Gorman89a55f52018-01-30 10:45:52 +00005636 if (sched_feat(WA_IDLE))
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005637 target = wake_affine_idle(this_cpu, prev_cpu, sync);
Peter Zijlstra90001d62017-07-31 17:50:05 +02005638
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005639 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5640 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
Mike Galbraithb3137bc2008-05-29 11:11:41 +02005641
Josh Poimboeufae928822016-06-17 12:43:24 -05005642 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005643 if (target == nr_cpumask_bits)
5644 return prev_cpu;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02005645
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005646 schedstat_inc(sd->ttwu_move_affine);
5647 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5648 return target;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01005649}
5650
Patrick Bellasif01415f2017-12-05 17:10:15 +00005651static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005652
5653static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
5654{
Joel Fernandesf453ae22017-12-14 13:21:58 -08005655 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005656}
5657
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005658/*
5659 * find_idlest_group finds and returns the least busy CPU group within the
5660 * domain.
Brendan Jackman6fee85c2017-10-05 12:45:15 +01005661 *
5662 * Assumes p is allowed on at least one CPU in sd.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005663 */
5664static struct sched_group *
Peter Zijlstra78e7ed52009-09-03 13:16:51 +02005665find_idlest_group(struct sched_domain *sd, struct task_struct *p,
Vincent Guittotc44f2a02013-10-18 13:52:21 +02005666 int this_cpu, int sd_flag)
Gregory Haskinse7693a32008-01-25 21:08:09 +01005667{
Andi Kleenb3bd3de2010-08-10 14:17:51 -07005668 struct sched_group *idlest = NULL, *group = sd->groups;
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005669 struct sched_group *most_spare_sg = NULL;
Brendan Jackman0d10ab92017-10-05 12:45:14 +01005670 unsigned long min_runnable_load = ULONG_MAX;
5671 unsigned long this_runnable_load = ULONG_MAX;
5672 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005673 unsigned long most_spare = 0, this_spare = 0;
Vincent Guittotc44f2a02013-10-18 13:52:21 +02005674 int load_idx = sd->forkexec_idx;
Vincent Guittot6b947802016-12-08 17:56:54 +01005675 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5676 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5677 (sd->imbalance_pct-100) / 100;
Gregory Haskinse7693a32008-01-25 21:08:09 +01005678
Vincent Guittotc44f2a02013-10-18 13:52:21 +02005679 if (sd_flag & SD_BALANCE_WAKE)
5680 load_idx = sd->wake_idx;
5681
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005682 do {
Vincent Guittot6b947802016-12-08 17:56:54 +01005683 unsigned long load, avg_load, runnable_load;
5684 unsigned long spare_cap, max_spare_cap;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005685 int local_group;
5686 int i;
Gregory Haskinse7693a32008-01-25 21:08:09 +01005687
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005688 /* Skip over this group if it has no CPUs allowed */
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02005689 if (!cpumask_intersects(sched_group_span(group),
Ingo Molnar0c98d342017-02-05 15:38:10 +01005690 &p->cpus_allowed))
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005691 continue;
5692
5693 local_group = cpumask_test_cpu(this_cpu,
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02005694 sched_group_span(group));
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005695
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005696 /*
5697 * Tally up the load of all CPUs in the group and find
5698 * the group containing the CPU with most spare capacity.
5699 */
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005700 avg_load = 0;
Vincent Guittot6b947802016-12-08 17:56:54 +01005701 runnable_load = 0;
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005702 max_spare_cap = 0;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005703
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02005704 for_each_cpu(i, sched_group_span(group)) {
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005705 /* Bias balancing toward CPUs of our domain */
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005706 if (local_group)
5707 load = source_load(i, load_idx);
5708 else
5709 load = target_load(i, load_idx);
5710
Vincent Guittot6b947802016-12-08 17:56:54 +01005711 runnable_load += load;
5712
5713 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005714
5715 spare_cap = capacity_spare_wake(i, p);
5716
5717 if (spare_cap > max_spare_cap)
5718 max_spare_cap = spare_cap;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005719 }
5720
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04005721 /* Adjust by relative CPU capacity of the group */
Vincent Guittot6b947802016-12-08 17:56:54 +01005722 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5723 group->sgc->capacity;
5724 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5725 group->sgc->capacity;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005726
5727 if (local_group) {
Vincent Guittot6b947802016-12-08 17:56:54 +01005728 this_runnable_load = runnable_load;
5729 this_avg_load = avg_load;
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005730 this_spare = max_spare_cap;
5731 } else {
Vincent Guittot6b947802016-12-08 17:56:54 +01005732 if (min_runnable_load > (runnable_load + imbalance)) {
5733 /*
5734 * The runnable load is significantly smaller
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005735 * so we can pick this new CPU:
Vincent Guittot6b947802016-12-08 17:56:54 +01005736 */
5737 min_runnable_load = runnable_load;
5738 min_avg_load = avg_load;
5739 idlest = group;
5740 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5741 (100*min_avg_load > imbalance_scale*avg_load)) {
5742 /*
5743 * The runnable loads are close so take the
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005744 * blocked load into account through avg_load:
Vincent Guittot6b947802016-12-08 17:56:54 +01005745 */
5746 min_avg_load = avg_load;
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005747 idlest = group;
5748 }
5749
5750 if (most_spare < max_spare_cap) {
5751 most_spare = max_spare_cap;
5752 most_spare_sg = group;
5753 }
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005754 }
5755 } while (group = group->next, group != sd->groups);
5756
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005757 /*
5758 * The cross-over point between using spare capacity and least load
5759 * is too conservative for high utilization tasks on partially
5760 * utilized systems if we require spare_capacity > task_util(p),
5761 * so we allow for some task stuffing by using
5762 * spare_capacity > task_util(p)/2.
Vincent Guittotf519a3f2016-12-08 17:56:53 +01005763 *
5764 * Spare capacity can't be used for fork because the utilization has
5765 * not been set yet, we must first select a rq to compute the initial
5766 * utilization.
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005767 */
Vincent Guittotf519a3f2016-12-08 17:56:53 +01005768 if (sd_flag & SD_BALANCE_FORK)
5769 goto skip_spare;
5770
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005771 if (this_spare > task_util(p) / 2 &&
Vincent Guittot6b947802016-12-08 17:56:54 +01005772 imbalance_scale*this_spare > 100*most_spare)
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005773 return NULL;
Vincent Guittot6b947802016-12-08 17:56:54 +01005774
5775 if (most_spare > task_util(p) / 2)
Morten Rasmussen6a0b19c2016-10-14 14:41:08 +01005776 return most_spare_sg;
5777
Vincent Guittotf519a3f2016-12-08 17:56:53 +01005778skip_spare:
Vincent Guittot6b947802016-12-08 17:56:54 +01005779 if (!idlest)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005780 return NULL;
Vincent Guittot6b947802016-12-08 17:56:54 +01005781
Mel Gorman2c833622018-02-13 13:37:29 +00005782 /*
5783 * When comparing groups across NUMA domains, it's possible for the
5784 * local domain to be very lightly loaded relative to the remote
5785 * domains but "imbalance" skews the comparison making remote CPUs
5786 * look much more favourable. When considering cross-domain, add
5787 * imbalance to the runnable load on the remote node and consider
5788 * staying local.
5789 */
5790 if ((sd->flags & SD_NUMA) &&
5791 min_runnable_load + imbalance >= this_runnable_load)
5792 return NULL;
5793
Vincent Guittot6b947802016-12-08 17:56:54 +01005794 if (min_runnable_load > (this_runnable_load + imbalance))
5795 return NULL;
5796
5797 if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5798 (100*this_avg_load < imbalance_scale*min_avg_load))
5799 return NULL;
5800
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005801 return idlest;
5802}
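/*
 * Editor's sketch (not from the original source), with illustrative
 * numbers for the spare-capacity path on a wakeup (not a fork): say
 * task_util(p) == 200, imbalance_scale == 112, the local group has
 * this_spare == 150 and a remote group has most_spare == 300.  The local
 * group passes the task_util/2 test (150 > 100), but 112 * 150 == 16800
 * is not greater than 100 * 300 == 30000, so we do not return NULL; since
 * most_spare > 100 the remote group is returned as the candidate.  Only
 * when no group has meaningful spare capacity does the code fall back to
 * comparing runnable loads with the imbalance margin.
 */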
5803
5804/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005805 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005806 */
5807static int
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01005808find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005809{
5810 unsigned long load, min_load = ULONG_MAX;
Nicolas Pitre83a0a962014-09-04 11:32:10 -04005811 unsigned int min_exit_latency = UINT_MAX;
5812 u64 latest_idle_timestamp = 0;
5813 int least_loaded_cpu = this_cpu;
5814 int shallowest_idle_cpu = -1;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005815 int i;
5816
Morten Rasmusseneaecf412016-06-22 18:03:14 +01005817 /* Check if we have any choice: */
5818 if (group->group_weight == 1)
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02005819 return cpumask_first(sched_group_span(group));
Morten Rasmusseneaecf412016-06-22 18:03:14 +01005820
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005821 /* Traverse only the allowed CPUs */
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02005822 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
Rohit Jain943d3552018-05-09 09:39:48 -07005823 if (available_idle_cpu(i)) {
Nicolas Pitre83a0a962014-09-04 11:32:10 -04005824 struct rq *rq = cpu_rq(i);
5825 struct cpuidle_state *idle = idle_get_state(rq);
5826 if (idle && idle->exit_latency < min_exit_latency) {
5827 /*
5828 * We give priority to a CPU whose idle state
5829 * has the smallest exit latency irrespective
5830 * of any idle timestamp.
5831 */
5832 min_exit_latency = idle->exit_latency;
5833 latest_idle_timestamp = rq->idle_stamp;
5834 shallowest_idle_cpu = i;
5835 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
5836 rq->idle_stamp > latest_idle_timestamp) {
5837 /*
5838 * If equal or no active idle state, then
5839 * the most recently idled CPU might have
5840 * a warmer cache.
5841 */
5842 latest_idle_timestamp = rq->idle_stamp;
5843 shallowest_idle_cpu = i;
5844 }
Yao Dongdong9f967422014-10-28 04:08:06 +00005845 } else if (shallowest_idle_cpu == -1) {
Viresh Kumarc7132dd2017-05-24 10:59:54 +05305846 load = weighted_cpuload(cpu_rq(i));
Joel Fernandes18cec7e2017-12-15 07:39:44 -08005847 if (load < min_load) {
Nicolas Pitre83a0a962014-09-04 11:32:10 -04005848 min_load = load;
5849 least_loaded_cpu = i;
5850 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01005851 }
5852 }
5853
Nicolas Pitre83a0a962014-09-04 11:32:10 -04005854 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02005855}
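/*
 * Editor's note (not from the original source): the selection order above
 * is exit-latency first, idle-recency second, load last.  For example, a
 * CPU idling in a shallow C-state (exit_latency 2us, idle since t=100)
 * beats one in a deep C-state (exit_latency 80us, idle since t=200) even
 * though the latter idled more recently; only when no allowed CPU is idle
 * does the smallest weighted_cpuload() decide.
 */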
Gregory Haskinse7693a32008-01-25 21:08:09 +01005856
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01005857static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
5858 int cpu, int prev_cpu, int sd_flag)
5859{
Brendan Jackman93f50f92017-10-05 12:45:16 +01005860 int new_cpu = cpu;
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01005861
Brendan Jackman6fee85c2017-10-05 12:45:15 +01005862 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
5863 return prev_cpu;
5864
Viresh Kumarc976a862018-04-26 16:00:51 +05305865 /*
5866 * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
5867 * last_update_time.
5868 */
5869 if (!(sd_flag & SD_BALANCE_FORK))
5870 sync_entity_load_avg(&p->se);
5871
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01005872 while (sd) {
5873 struct sched_group *group;
5874 struct sched_domain *tmp;
5875 int weight;
5876
5877 if (!(sd->flags & sd_flag)) {
5878 sd = sd->child;
5879 continue;
5880 }
5881
5882 group = find_idlest_group(sd, p, cpu, sd_flag);
5883 if (!group) {
5884 sd = sd->child;
5885 continue;
5886 }
5887
5888 new_cpu = find_idlest_group_cpu(group, p, cpu);
Brendan Jackmane90381e2017-10-05 12:45:13 +01005889 if (new_cpu == cpu) {
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005890 /* Now try balancing at a lower domain level of 'cpu': */
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01005891 sd = sd->child;
5892 continue;
5893 }
5894
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005895 /* Now try balancing at a lower domain level of 'new_cpu': */
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01005896 cpu = new_cpu;
5897 weight = sd->span_weight;
5898 sd = NULL;
5899 for_each_domain(cpu, tmp) {
5900 if (weight <= tmp->span_weight)
5901 break;
5902 if (tmp->flags & sd_flag)
5903 sd = tmp;
5904 }
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01005905 }
5906
5907 return new_cpu;
5908}
5909
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005910#ifdef CONFIG_SCHED_SMT
Peter Zijlstraba2591a2018-05-29 16:43:46 +02005911DEFINE_STATIC_KEY_FALSE(sched_smt_present);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005912
5913static inline void set_idle_cores(int cpu, int val)
5914{
5915 struct sched_domain_shared *sds;
5916
5917 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5918 if (sds)
5919 WRITE_ONCE(sds->has_idle_cores, val);
5920}
5921
5922static inline bool test_idle_cores(int cpu, bool def)
5923{
5924 struct sched_domain_shared *sds;
5925
5926 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5927 if (sds)
5928 return READ_ONCE(sds->has_idle_cores);
5929
5930 return def;
5931}
5932
5933/*
5934 * Scans the local SMT mask to see if the entire core is idle, and records this
5935 * information in sd_llc_shared->has_idle_cores.
5936 *
5937 * Since SMT siblings share all cache levels, inspecting this limited remote
5938 * state should be fairly cheap.
5939 */
Peter Zijlstra1b568f02016-05-09 10:38:41 +02005940void __update_idle_core(struct rq *rq)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005941{
5942 int core = cpu_of(rq);
5943 int cpu;
5944
5945 rcu_read_lock();
5946 if (test_idle_cores(core, true))
5947 goto unlock;
5948
5949 for_each_cpu(cpu, cpu_smt_mask(core)) {
5950 if (cpu == core)
5951 continue;
5952
Rohit Jain943d3552018-05-09 09:39:48 -07005953 if (!available_idle_cpu(cpu))
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005954 goto unlock;
5955 }
5956
5957 set_idle_cores(core, 1);
5958unlock:
5959 rcu_read_unlock();
5960}
5961
5962/*
5963 * Scan the entire LLC domain for idle cores; this dynamically switches off if
5964 * there are no idle cores left in the system; tracked through
5965 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5966 */
5967static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5968{
5969 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
Peter Zijlstrac743f0a2017-04-14 14:20:05 +02005970 int core, cpu;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005971
Peter Zijlstra1b568f02016-05-09 10:38:41 +02005972 if (!static_branch_likely(&sched_smt_present))
5973 return -1;
5974
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005975 if (!test_idle_cores(target, false))
5976 return -1;
5977
Ingo Molnar0c98d342017-02-05 15:38:10 +01005978 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005979
Peter Zijlstrac743f0a2017-04-14 14:20:05 +02005980 for_each_cpu_wrap(core, cpus, target) {
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005981 bool idle = true;
5982
5983 for_each_cpu(cpu, cpu_smt_mask(core)) {
5984 cpumask_clear_cpu(cpu, cpus);
Rohit Jain943d3552018-05-09 09:39:48 -07005985 if (!available_idle_cpu(cpu))
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005986 idle = false;
5987 }
5988
5989 if (idle)
5990 return core;
5991 }
5992
5993 /*
5994 * Failed to find an idle core; stop looking for one.
5995 */
5996 set_idle_cores(target, 0);
5997
5998 return -1;
5999}
6000
6001/*
6002 * Scan the local SMT mask for idle CPUs.
6003 */
6004static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6005{
6006 int cpu;
6007
Peter Zijlstra1b568f02016-05-09 10:38:41 +02006008 if (!static_branch_likely(&sched_smt_present))
6009 return -1;
6010
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006011 for_each_cpu(cpu, cpu_smt_mask(target)) {
Ingo Molnar0c98d342017-02-05 15:38:10 +01006012 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006013 continue;
Rohit Jain943d3552018-05-09 09:39:48 -07006014 if (available_idle_cpu(cpu))
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006015 return cpu;
6016 }
6017
6018 return -1;
6019}
6020
6021#else /* CONFIG_SCHED_SMT */
6022
6023static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
6024{
6025 return -1;
6026}
6027
6028static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6029{
6030 return -1;
6031}
6032
6033#endif /* CONFIG_SCHED_SMT */
6034
6035/*
6036 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
6037 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
6038 * average idle time for this rq (as found in rq->avg_idle).
6039 */
6040static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
6041{
Wanpeng Li9cfb38a2016-10-09 08:04:03 +08006042 struct sched_domain *this_sd;
Peter Zijlstra1ad3aaf2017-05-17 12:53:50 +02006043 u64 avg_cost, avg_idle;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006044 u64 time, cost;
6045 s64 delta;
Peter Zijlstra1ad3aaf2017-05-17 12:53:50 +02006046 int cpu, nr = INT_MAX;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006047
Wanpeng Li9cfb38a2016-10-09 08:04:03 +08006048 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6049 if (!this_sd)
6050 return -1;
6051
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006052 /*
6053 * Due to large variance we need a large fuzz factor; hackbench in
6054 * particular is sensitive here.
6055 */
Peter Zijlstra1ad3aaf2017-05-17 12:53:50 +02006056 avg_idle = this_rq()->avg_idle / 512;
6057 avg_cost = this_sd->avg_scan_cost + 1;
6058
6059 if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006060 return -1;
6061
Peter Zijlstra1ad3aaf2017-05-17 12:53:50 +02006062 if (sched_feat(SIS_PROP)) {
6063 u64 span_avg = sd->span_weight * avg_idle;
6064 if (span_avg > 4*avg_cost)
6065 nr = div_u64(span_avg, avg_cost);
6066 else
6067 nr = 4;
6068 }
6069
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006070 time = local_clock();
6071
Peter Zijlstrac743f0a2017-04-14 14:20:05 +02006072 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
Peter Zijlstra1ad3aaf2017-05-17 12:53:50 +02006073 if (!--nr)
6074 return -1;
Ingo Molnar0c98d342017-02-05 15:38:10 +01006075 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006076 continue;
Rohit Jain943d3552018-05-09 09:39:48 -07006077 if (available_idle_cpu(cpu))
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006078 break;
6079 }
6080
6081 time = local_clock() - time;
6082 cost = this_sd->avg_scan_cost;
6083 delta = (s64)(time - cost) / 8;
6084 this_sd->avg_scan_cost += delta;
6085
6086 return cpu;
6087}
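/*
 * Editor's sketch (not from the original source), with illustrative
 * numbers for the SIS_PROP budget above, assuming SIS_AVG_CPU is disabled
 * so the proportional path is taken: with rq->avg_idle == 102400ns the
 * scaled avg_idle is 200, and with this_sd->avg_scan_cost == 799 the
 * avg_cost is 800.  On a 64-CPU LLC, span_avg = 64 * 200 = 12800 is
 * greater than 4 * 800, so nr = 16: the wrap-around walk gives up after
 * probing about 16 CPUs instead of scanning the whole domain, keeping the
 * scan cost proportional to how much idle time this CPU actually has.
 */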
6088
6089/*
6090 * Try and locate an idle core/thread in the LLC cache domain.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006091 */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006092static int select_idle_sibling(struct task_struct *p, int prev, int target)
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006093{
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006094 struct sched_domain *sd;
Mel Gorman32e839d2018-01-30 10:45:55 +00006095 int i, recent_used_cpu;
Mike Galbraithe0a79f52013-01-28 12:19:25 +01006096
Rohit Jain943d3552018-05-09 09:39:48 -07006097 if (available_idle_cpu(target))
Mike Galbraithe0a79f52013-01-28 12:19:25 +01006098 return target;
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006099
6100 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006101 * If the previous CPU is cache affine and idle, don't be stupid:
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006102 */
Rohit Jain943d3552018-05-09 09:39:48 -07006103 if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006104 return prev;
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006105
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006106 /* Check a recently used CPU as a potential idle candidate: */
Mel Gorman32e839d2018-01-30 10:45:55 +00006107 recent_used_cpu = p->recent_used_cpu;
6108 if (recent_used_cpu != prev &&
6109 recent_used_cpu != target &&
6110 cpus_share_cache(recent_used_cpu, target) &&
Rohit Jain943d3552018-05-09 09:39:48 -07006111 available_idle_cpu(recent_used_cpu) &&
Mel Gorman32e839d2018-01-30 10:45:55 +00006112 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6113 /*
6114 * Replace recent_used_cpu with prev as it is a potential
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006115 * candidate for the next wake:
Mel Gorman32e839d2018-01-30 10:45:55 +00006116 */
6117 p->recent_used_cpu = prev;
6118 return recent_used_cpu;
6119 }
6120
Peter Zijlstra518cd622011-12-07 15:07:31 +01006121 sd = rcu_dereference(per_cpu(sd_llc, target));
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006122 if (!sd)
6123 return target;
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006124
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006125 i = select_idle_core(p, sd, target);
6126 if ((unsigned)i < nr_cpumask_bits)
Gregory Haskinse7693a32008-01-25 21:08:09 +01006127 return i;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006128
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006129 i = select_idle_cpu(p, sd, target);
6130 if ((unsigned)i < nr_cpumask_bits)
6131 return i;
Mike Galbraith970e1782012-06-12 05:18:32 +02006132
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006133 i = select_idle_smt(p, sd, target);
6134 if ((unsigned)i < nr_cpumask_bits)
6135 return i;
Linus Torvalds37407ea2012-09-16 12:29:43 -07006136
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006137 return target;
6138}
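/*
 * Editor's note (not from the original source): the fast path above
 * prefers, in order, an already idle @target, a cache-affine idle @prev,
 * a cache-affine idle recently-used CPU, then the LLC-wide scans: a fully
 * idle core, any idle CPU (budget-limited in select_idle_cpu()), and
 * finally an idle SMT sibling of @target.  If everything fails the task
 * simply stays on @target.
 */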
Dietmar Eggemann231678b2015-08-14 17:23:13 +01006139
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006140/**
6141 * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
6142 * @cpu: the CPU to get the utilization of
6143 *
6144 * The return value is in units of capacity so that we can compare the
6145 * utilization with the capacity of the CPU that is available for CFS tasks
6146 * (i.e. cpu_capacity).
Dietmar Eggemann231678b2015-08-14 17:23:13 +01006147 *
6148 * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
6149 * recent utilization of currently non-runnable tasks on a CPU. It represents
6150 * the amount of utilization of a CPU in the range [0..capacity_orig] where
6151 * capacity_orig is the cpu_capacity available at the highest frequency
6152 * (arch_scale_freq_capacity()).
6153 * The utilization of a CPU converges towards a sum equal to or less than the
6154 * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
6155 * the running time on this CPU scaled by capacity_curr.
6156 *
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006157 * The estimated utilization of a CPU is defined to be the maximum between its
6158 * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
6159 * currently RUNNABLE on that CPU.
6160 * This allows us to properly represent the expected utilization of a CPU which
6161 * has just got a big task running after a long sleep period. At the same time
6162 * however it preserves the benefits of the "blocked utilization" in
6163 * describing the potential for other tasks waking up on the same CPU.
6164 *
Dietmar Eggemann231678b2015-08-14 17:23:13 +01006165 * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
6166 * higher than capacity_orig because of unfortunate rounding in
6167 * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
6168 * the average stabilizes with the new running time. We need to check that the
6169 * utilization stays within the range of [0..capacity_orig] and cap it if
6170 * necessary. Without utilization capping, a group could be seen as overloaded
6171 * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
6172 * available capacity. We allow utilization to overshoot capacity_curr (but not
6173 * capacity_orig) as it is useful for predicting the capacity required after task
6174 * migrations (scheduler-driven DVFS).
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006175 *
6176 * Return: the (estimated) utilization for the specified CPU
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006177 */
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006178static inline unsigned long cpu_util(int cpu)
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006179{
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006180 struct cfs_rq *cfs_rq;
6181 unsigned int util;
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006182
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006183 cfs_rq = &cpu_rq(cpu)->cfs;
6184 util = READ_ONCE(cfs_rq->avg.util_avg);
6185
6186 if (sched_feat(UTIL_EST))
6187 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6188
6189 return min_t(unsigned long, util, capacity_orig_of(cpu));
Vincent Guittot8bb5b002015-03-04 08:48:47 +01006190}
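
/*
 * Illustrative sketch, not part of this file: with UTIL_EST enabled the body
 * above is a max over the two utilization signals followed by a clamp.  A
 * hypothetical stand-alone version, with plain parameters in place of the
 * rq fields:
 *
 *	static unsigned long cpu_util_sketch(unsigned long util_avg,
 *					     unsigned long util_est_enqueued,
 *					     unsigned long capacity_orig)
 *	{
 *		unsigned long util = util_avg;
 *
 *		if (util_est_enqueued > util)		// UTIL_EST signal wins
 *			util = util_est_enqueued;
 *
 *		// never report more than the CPU offers at its highest frequency
 *		return util < capacity_orig ? util : capacity_orig;
 *	}
 *
 * e.g. util_avg = 300, util_est.enqueued = 600, capacity_orig = 512 gives
 * min(max(300, 600), 512) = 512.
 */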
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006191
Morten Rasmussen32731632016-07-25 14:34:26 +01006192/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006193 * cpu_util_wake: Compute CPU utilization with any contributions from
Morten Rasmussen104cb162016-10-14 14:41:07 +01006194 * the waking task p removed.
6195 */
Patrick Bellasif01415f2017-12-05 17:10:15 +00006196static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
Morten Rasmussen104cb162016-10-14 14:41:07 +01006197{
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006198 struct cfs_rq *cfs_rq;
6199 unsigned int util;
Morten Rasmussen104cb162016-10-14 14:41:07 +01006200
6201 /* Task has no contribution or is new */
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006202 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
Morten Rasmussen104cb162016-10-14 14:41:07 +01006203 return cpu_util(cpu);
6204
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006205 cfs_rq = &cpu_rq(cpu)->cfs;
6206 util = READ_ONCE(cfs_rq->avg.util_avg);
Morten Rasmussen104cb162016-10-14 14:41:07 +01006207
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006208 /* Discount task's blocked util from CPU's util */
6209 util -= min_t(unsigned int, util, task_util(p));
6210
6211 /*
6212 * Covered cases:
6213 *
6214 * a) if *p is the only task sleeping on this CPU, then:
6215 * cpu_util (== task_util) > util_est (== 0)
6216 * and thus we return:
6217 * cpu_util_wake = (cpu_util - task_util) = 0
6218 *
6219 * b) if other tasks are SLEEPING on this CPU, which is now exiting
6220 * IDLE, then:
6221 * cpu_util >= task_util
6222 * cpu_util > util_est (== 0)
6223 * and thus we discount *p's blocked utilization to return:
6224 * cpu_util_wake = (cpu_util - task_util) >= 0
6225 *
6226 * c) if other tasks are RUNNABLE on that CPU and
6227 * util_est > cpu_util
6228 * then we use util_est since it returns a more restrictive
6229 * estimation of the spare capacity on that CPU, by just
6230 * considering the expected utilization of tasks already
6231 * runnable on that CPU.
6232 *
6233 * Cases a) and b) are covered by the above code, while case c) is
6234 * covered by the following code when estimated utilization is
6235 * enabled.
6236 */
6237 if (sched_feat(UTIL_EST))
6238 util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
6239
6240 /*
6241 * Utilization (estimated) can exceed the CPU capacity, thus let's
6242 * clamp to the maximum CPU capacity to ensure consistency with
6243 * the cpu_util call.
6244 */
6245 return min_t(unsigned long, util, capacity_orig_of(cpu));
Morten Rasmussen104cb162016-10-14 14:41:07 +01006246}
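
/*
 * Worked example for the cases above (illustrative numbers only, with
 * capacity_orig_of(cpu) = 1024):
 *
 *  a) util_avg = 200, task_util(p) = 200, util_est.enqueued =   0
 *     -> max(200 - 200,   0) =   0
 *  b) util_avg = 500, task_util(p) = 200, util_est.enqueued =   0
 *     -> max(500 - 200,   0) = 300
 *  c) util_avg = 300, task_util(p) = 100, util_est.enqueued = 450
 *     -> max(300 - 100, 450) = 450, still below the 1024 clamp
 */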
6247
6248/*
Morten Rasmussen32731632016-07-25 14:34:26 +01006249 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6250 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6251 *
6252 * In that case WAKE_AFFINE doesn't make sense and we'll let
6253 * BALANCE_WAKE sort things out.
6254 */
6255static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6256{
6257 long min_cap, max_cap;
6258
Morten Rasmussendf054e82018-07-04 11:17:39 +01006259 if (!static_branch_unlikely(&sched_asym_cpucapacity))
6260 return 0;
6261
Morten Rasmussen32731632016-07-25 14:34:26 +01006262 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6263 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
6264
6265 /* Minimum capacity is close to max, no need to abort wake_affine */
6266 if (max_cap - min_cap < max_cap >> 3)
6267 return 0;
6268
Morten Rasmussen104cb162016-10-14 14:41:07 +01006269 /* Bring task utilization in sync with prev_cpu */
6270 sync_entity_load_avg(&p->se);
6271
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01006272 return !task_fits_capacity(p, min_cap);
Morten Rasmussen32731632016-07-25 14:34:26 +01006273}
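
/*
 * Illustrative numbers for the margin check above: with max_cap = 1024 the
 * allowed gap is 1024 >> 3 = 128, so any min_cap above 896 counts as "close
 * to max" and the wake_affine fast path is kept; an asymmetric system with
 * min_cap = 512 fails that test and the decision falls to
 * task_fits_capacity(p, 512) instead.
 */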
6274
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006275/*
Morten Rasmussende91b9c2014-02-18 14:14:24 +00006276 * select_task_rq_fair: Select target runqueue for the waking task in domains
6277 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6278 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006279 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006280 * Balances load by selecting the idlest CPU in the idlest group, or under
6281 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006282 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006283 * Returns the target CPU number.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006284 *
6285 * preempt must be disabled.
6286 */
Peter Zijlstra0017d732010-03-24 18:34:10 +01006287static int
Peter Zijlstraac66f542013-10-07 11:29:16 +01006288select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006289{
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306290 struct sched_domain *tmp, *sd = NULL;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006291 int cpu = smp_processor_id();
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006292 int new_cpu = prev_cpu;
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006293 int want_affine = 0;
Peter Zijlstra24d0c1d2018-02-13 13:37:28 +00006294 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
Gregory Haskinse7693a32008-01-25 21:08:09 +01006295
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006296 if (sd_flag & SD_BALANCE_WAKE) {
6297 record_wakee(p);
Morten Rasmussen32731632016-07-25 14:34:26 +01006298 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
Ingo Molnar0c98d342017-02-05 15:38:10 +01006299 && cpumask_test_cpu(cpu, &p->cpus_allowed);
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006300 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01006301
Peter Zijlstradce840a2011-04-07 14:09:50 +02006302 rcu_read_lock();
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006303 for_each_domain(cpu, tmp) {
Peter Zijlstrae4f42882009-12-16 18:04:34 +01006304 if (!(tmp->flags & SD_LOAD_BALANCE))
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006305 break;
Peter Zijlstrae4f42882009-12-16 18:04:34 +01006306
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006307 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006308 * If both 'cpu' and 'prev_cpu' are part of this domain,
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006309 * cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstrafe3bcfe2009-11-12 15:55:29 +01006310 */
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006311 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6312 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306313 if (cpu != prev_cpu)
6314 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6315
6316 sd = NULL; /* Prefer wake_affine over balance flags */
Alex Shif03542a2012-07-26 08:55:34 +08006317 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006318 }
6319
Alex Shif03542a2012-07-26 08:55:34 +08006320 if (tmp->flags & sd_flag)
Peter Zijlstra29cd8ba2009-09-17 09:01:14 +02006321 sd = tmp;
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006322 else if (!want_affine)
6323 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006324 }
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006325
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306326 if (unlikely(sd)) {
6327 /* Slow path */
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006328 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306329 } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
6330 /* Fast path */
6331
6332 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6333
6334 if (want_affine)
6335 current->recent_used_cpu = cpu;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006336 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02006337 rcu_read_unlock();
Gregory Haskinse7693a32008-01-25 21:08:09 +01006338
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006339 return new_cpu;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006340}
Paul Turner0a74bef2012-10-04 13:18:30 +02006341
Peter Zijlstra144d8482017-05-11 17:57:24 +02006342static void detach_entity_cfs_rq(struct sched_entity *se);
6343
Paul Turner0a74bef2012-10-04 13:18:30 +02006344/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006345 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
Paul Turner0a74bef2012-10-04 13:18:30 +02006346 * cfs_rq_of(p) references at time of call are still valid and identify the
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006347 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
Paul Turner0a74bef2012-10-04 13:18:30 +02006348 */
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05306349static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
Paul Turner0a74bef2012-10-04 13:18:30 +02006350{
Paul Turneraff3e492012-10-04 13:18:30 +02006351 /*
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02006352 * As blocked tasks retain absolute vruntime the migration needs to
6353 * deal with this by subtracting the old and adding the new
6354 * min_vruntime -- the latter is done by enqueue_entity() when placing
6355 * the task on the new runqueue.
6356 */
6357 if (p->state == TASK_WAKING) {
6358 struct sched_entity *se = &p->se;
6359 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6360 u64 min_vruntime;
6361
6362#ifndef CONFIG_64BIT
6363 u64 min_vruntime_copy;
6364
6365 do {
6366 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6367 smp_rmb();
6368 min_vruntime = cfs_rq->min_vruntime;
6369 } while (min_vruntime != min_vruntime_copy);
6370#else
6371 min_vruntime = cfs_rq->min_vruntime;
6372#endif
6373
6374 se->vruntime -= min_vruntime;
6375 }
6376
Peter Zijlstra144d8482017-05-11 17:57:24 +02006377 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6378 /*
6379 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
6380 * rq->lock and can modify state directly.
6381 */
6382 lockdep_assert_held(&task_rq(p)->lock);
6383 detach_entity_cfs_rq(&p->se);
6384
6385 } else {
6386 /*
6387 * We are supposed to update the task to "current" time, so that
6388 * it's up to date and ready to go to the new CPU/cfs_rq. But we
6389 * have difficulty getting hold of the current time here, so simply
6390 * throw away the out-of-date time. This results in the wakee task
6391 * being less decayed, but giving the wakee more load is not a bad
6392 * trade-off.
6393 */
6394 remove_entity_load_avg(&p->se);
6395 }
Yuyang Du9d89c252015-07-15 08:04:37 +08006396
6397 /* Tell new CPU we are migrated */
6398 p->se.avg.last_update_time = 0;
Ben Segall3944a922014-05-15 15:59:20 -07006399
6400 /* We have migrated, no longer consider this task hot */
Yuyang Du9d89c252015-07-15 08:04:37 +08006401 p->se.exec_start = 0;
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05306402
6403 update_scan_period(p, new_cpu);
Paul Turner0a74bef2012-10-04 13:18:30 +02006404}
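
/*
 * A note on the !CONFIG_64BIT loop above: a 64-bit min_vruntime cannot be
 * loaded atomically on 32-bit, so the reader retries until the value and its
 * published copy agree, pairing with a write barrier on the update side.
 * Stand-alone sketch of the read side (hypothetical helper name):
 *
 *	static u64 read_min_vruntime(struct cfs_rq *cfs_rq)
 *	{
 *		u64 copy, val;
 *
 *		do {
 *			copy = cfs_rq->min_vruntime_copy;
 *			smp_rmb();	// read the copy before the value
 *			val  = cfs_rq->min_vruntime;
 *		} while (val != copy);
 *
 *		return val;
 *	}
 */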
Yuyang Du12695572015-07-15 08:04:40 +08006405
6406static void task_dead_fair(struct task_struct *p)
6407{
6408 remove_entity_load_avg(&p->se);
6409}
Gregory Haskinse7693a32008-01-25 21:08:09 +01006410#endif /* CONFIG_SMP */
6411
Cheng Jiana555e9d2017-12-07 21:30:43 +08006412static unsigned long wakeup_gran(struct sched_entity *se)
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02006413{
6414 unsigned long gran = sysctl_sched_wakeup_granularity;
6415
6416 /*
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01006417	 * Since it's curr that is running now, convert the gran from real-time
6418	 * to virtual-time in its units.
Mike Galbraith13814d42010-03-11 17:17:04 +01006419 *
6420 * By using 'se' instead of 'curr' we penalize light tasks, so
6421 * they get preempted easier. That is, if 'se' < 'curr' then
6422 * the resulting gran will be larger, therefore penalizing the
6423 * lighter, if otoh 'se' > 'curr' then the resulting gran will
6424 * be smaller, again penalizing the lighter task.
6425 *
6426 * This is especially important for buddies when the leftmost
6427 * task is higher priority than the buddy.
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02006428 */
Shaohua Lif4ad9bd2011-04-08 12:53:09 +08006429 return calc_delta_fair(gran, se);
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02006430}
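
/*
 * Illustrative numbers: with a wakeup granularity of 1 ms (the base value
 * before CPU-count scaling), a nice-0 'se' (weight 1024) keeps a virtual
 * gran of ~1 ms, while a nice +5 'se' (weight 335) ends up with roughly
 * 1024/335 ~= 3 ms, so the lighter task needs a correspondingly larger
 * vruntime lead before it is allowed to preempt.
 */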
6431
6432/*
Peter Zijlstra464b7522008-10-24 11:06:15 +02006433 * Should 'se' preempt 'curr'.
6434 *
6435 * |s1
6436 * |s2
6437 * |s3
6438 * g
6439 * |<--->|c
6440 *
6441 * w(c, s1) = -1
6442 * w(c, s2) = 0
6443 * w(c, s3) = 1
6444 *
6445 */
6446static int
6447wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6448{
6449 s64 gran, vdiff = curr->vruntime - se->vruntime;
6450
6451 if (vdiff <= 0)
6452 return -1;
6453
Cheng Jiana555e9d2017-12-07 21:30:43 +08006454 gran = wakeup_gran(se);
Peter Zijlstra464b7522008-10-24 11:06:15 +02006455 if (vdiff > gran)
6456 return 1;
6457
6458 return 0;
6459}
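
/*
 * Illustrative numbers matching the diagram above, with
 * curr->vruntime = 1000 and gran = 100:
 *
 *	se->vruntime = 1100: vdiff = -100 <= 0    -> return -1	(s1)
 *	se->vruntime =  950: vdiff =   50 <= gran -> return  0	(s2)
 *	se->vruntime =  800: vdiff =  200 >  gran -> return  1	(s3)
 *
 * Only a return of 1 makes check_preempt_wakeup() below actually reschedule.
 */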
6460
Peter Zijlstra02479092008-11-04 21:25:10 +01006461static void set_last_buddy(struct sched_entity *se)
6462{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07006463 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6464 return;
6465
Daniel Axtensc5ae3662017-05-11 06:11:39 +10006466 for_each_sched_entity(se) {
6467 if (SCHED_WARN_ON(!se->on_rq))
6468 return;
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07006469 cfs_rq_of(se)->last = se;
Daniel Axtensc5ae3662017-05-11 06:11:39 +10006470 }
Peter Zijlstra02479092008-11-04 21:25:10 +01006471}
6472
6473static void set_next_buddy(struct sched_entity *se)
6474{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07006475 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
6476 return;
6477
Daniel Axtensc5ae3662017-05-11 06:11:39 +10006478 for_each_sched_entity(se) {
6479 if (SCHED_WARN_ON(!se->on_rq))
6480 return;
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07006481 cfs_rq_of(se)->next = se;
Daniel Axtensc5ae3662017-05-11 06:11:39 +10006482 }
Peter Zijlstra02479092008-11-04 21:25:10 +01006483}
6484
Rik van Rielac53db52011-02-01 09:51:03 -05006485static void set_skip_buddy(struct sched_entity *se)
6486{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07006487 for_each_sched_entity(se)
6488 cfs_rq_of(se)->skip = se;
Rik van Rielac53db52011-02-01 09:51:03 -05006489}
6490
Peter Zijlstra464b7522008-10-24 11:06:15 +02006491/*
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006492 * Preempt the current task with a newly woken task if needed:
6493 */
Peter Zijlstra5a9b86f2009-09-16 13:47:58 +02006494static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006495{
6496 struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri8651a862007-10-15 17:00:12 +02006497 struct sched_entity *se = &curr->se, *pse = &p->se;
Mike Galbraith03e89e42008-12-16 08:45:30 +01006498 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02006499 int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07006500 int next_buddy_marked = 0;
Mike Galbraith03e89e42008-12-16 08:45:30 +01006501
Ingo Molnar4ae7d5c2008-03-19 01:42:00 +01006502 if (unlikely(se == pse))
6503 return;
6504
Paul Turner5238cdd2011-07-21 09:43:37 -07006505 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04006506 * This is possible from callers such as attach_tasks(), in which we
Paul Turner5238cdd2011-07-21 09:43:37 -07006507	 * unconditionally check_preempt_curr() after an enqueue (which may have
6508	 * led to a throttle). This both saves work and prevents false
6509 * next-buddy nomination below.
6510 */
6511 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
6512 return;
6513
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07006514 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith3cb63d52009-09-11 12:01:17 +02006515 set_next_buddy(pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07006516 next_buddy_marked = 1;
6517 }
Peter Zijlstra57fdc262008-09-23 15:33:45 +02006518
Bharata B Raoaec0a512008-08-28 14:42:49 +05306519 /*
6520 * We can come here with TIF_NEED_RESCHED already set from new task
6521 * wake up path.
Paul Turner5238cdd2011-07-21 09:43:37 -07006522 *
6523 * Note: this also catches the edge-case of curr being in a throttled
6524 * group (e.g. via set_curr_task), since update_curr() (in the
6525 * enqueue of curr) will have resulted in resched being set. This
6526 * prevents us from potentially nominating it as a false LAST_BUDDY
6527 * below.
Bharata B Raoaec0a512008-08-28 14:42:49 +05306528 */
6529 if (test_tsk_need_resched(curr))
6530 return;
6531
Darren Harta2f5c9a2011-02-22 13:04:33 -08006532 /* Idle tasks are by definition preempted by non-idle tasks. */
6533 if (unlikely(curr->policy == SCHED_IDLE) &&
6534 likely(p->policy != SCHED_IDLE))
6535 goto preempt;
6536
Ingo Molnar91c234b2007-10-15 17:00:18 +02006537 /*
Darren Harta2f5c9a2011-02-22 13:04:33 -08006538 * Batch and idle tasks do not preempt non-idle tasks (their preemption
6539 * is driven by the tick):
Ingo Molnar91c234b2007-10-15 17:00:18 +02006540 */
Ingo Molnar8ed92e52012-10-14 14:28:50 +02006541 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar91c234b2007-10-15 17:00:18 +02006542 return;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006543
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01006544 find_matching_se(&se, &pse);
Paul Turner9bbd7372011-07-05 19:07:21 -07006545 update_curr(cfs_rq_of(se));
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01006546 BUG_ON(!pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07006547 if (wakeup_preempt_entity(se, pse) == 1) {
6548 /*
6549 * Bias pick_next to pick the sched entity that is
6550 * triggering this preemption.
6551 */
6552 if (!next_buddy_marked)
6553 set_next_buddy(pse);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01006554 goto preempt;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07006555 }
Jupyung Leea65ac742009-11-17 18:51:40 +09006556
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01006557 return;
6558
6559preempt:
Kirill Tkhai88751252014-06-29 00:03:57 +04006560 resched_curr(rq);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01006561 /*
6562 * Only set the backward buddy when the current task is still
6563 * on the rq. This can happen when a wakeup gets interleaved
6564 * with schedule on the ->pre_schedule() or idle_balance()
6565	 * point, either of which can drop the rq lock.
6566 *
6567 * Also, during early boot the idle thread is in the fair class,
6568 * for obvious reasons its a bad idea to schedule back to it.
6569 */
6570 if (unlikely(!se->on_rq || curr == rq->idle))
6571 return;
6572
6573 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
6574 set_last_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006575}
6576
Peter Zijlstra606dba22012-02-11 06:05:00 +01006577static struct task_struct *
Matt Flemingd8ac8972016-09-21 14:38:10 +01006578pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006579{
6580 struct cfs_rq *cfs_rq = &rq->cfs;
6581 struct sched_entity *se;
Peter Zijlstra678d5712012-02-11 06:05:00 +01006582 struct task_struct *p;
Peter Zijlstra37e117c2014-02-14 12:25:08 +01006583 int new_tasks;
Peter Zijlstra678d5712012-02-11 06:05:00 +01006584
Peter Zijlstra6e831252014-02-11 16:11:48 +01006585again:
Peter Zijlstra678d5712012-02-11 06:05:00 +01006586 if (!cfs_rq->nr_running)
Peter Zijlstra38033c32014-01-23 20:32:21 +01006587 goto idle;
Peter Zijlstra678d5712012-02-11 06:05:00 +01006588
Viresh Kumar9674f5c2017-05-24 10:59:55 +05306589#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra3f1d2a32014-02-12 10:49:30 +01006590 if (prev->sched_class != &fair_sched_class)
Peter Zijlstra678d5712012-02-11 06:05:00 +01006591 goto simple;
6592
6593 /*
6594 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
6595 * likely that a next task is from the same cgroup as the current.
6596 *
6597 * Therefore attempt to avoid putting and setting the entire cgroup
6598 * hierarchy, only change the part that actually changes.
6599 */
6600
6601 do {
6602 struct sched_entity *curr = cfs_rq->curr;
6603
6604 /*
6605 * Since we got here without doing put_prev_entity() we also
6606 * have to consider cfs_rq->curr. If it is still a runnable
6607 * entity, update_curr() will update its vruntime, otherwise
6608 * forget we've ever seen it.
6609 */
Ben Segall54d27362015-04-06 15:28:10 -07006610 if (curr) {
6611 if (curr->on_rq)
6612 update_curr(cfs_rq);
6613 else
6614 curr = NULL;
Peter Zijlstra678d5712012-02-11 06:05:00 +01006615
Ben Segall54d27362015-04-06 15:28:10 -07006616 /*
6617 * This call to check_cfs_rq_runtime() will do the
6618 * throttle and dequeue its entity in the parent(s).
Viresh Kumar9674f5c2017-05-24 10:59:55 +05306619 * Therefore the nr_running test will indeed
Ben Segall54d27362015-04-06 15:28:10 -07006620 * be correct.
6621 */
Viresh Kumar9674f5c2017-05-24 10:59:55 +05306622 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
6623 cfs_rq = &rq->cfs;
6624
6625 if (!cfs_rq->nr_running)
6626 goto idle;
6627
Ben Segall54d27362015-04-06 15:28:10 -07006628 goto simple;
Viresh Kumar9674f5c2017-05-24 10:59:55 +05306629 }
Ben Segall54d27362015-04-06 15:28:10 -07006630 }
Peter Zijlstra678d5712012-02-11 06:05:00 +01006631
6632 se = pick_next_entity(cfs_rq, curr);
6633 cfs_rq = group_cfs_rq(se);
6634 } while (cfs_rq);
6635
6636 p = task_of(se);
6637
6638 /*
6639 * Since we haven't yet done put_prev_entity and if the selected task
6640	 * is a different task than we started out with, try to touch the
6641	 * fewest possible cfs_rqs.
6642 */
6643 if (prev != p) {
6644 struct sched_entity *pse = &prev->se;
6645
6646 while (!(cfs_rq = is_same_group(se, pse))) {
6647 int se_depth = se->depth;
6648 int pse_depth = pse->depth;
6649
6650 if (se_depth <= pse_depth) {
6651 put_prev_entity(cfs_rq_of(pse), pse);
6652 pse = parent_entity(pse);
6653 }
6654 if (se_depth >= pse_depth) {
6655 set_next_entity(cfs_rq_of(se), se);
6656 se = parent_entity(se);
6657 }
6658 }
6659
6660 put_prev_entity(cfs_rq, pse);
6661 set_next_entity(cfs_rq, se);
6662 }
6663
Uladzislau Rezki93824902017-09-13 12:24:30 +02006664 goto done;
Peter Zijlstra678d5712012-02-11 06:05:00 +01006665simple:
Peter Zijlstra678d5712012-02-11 06:05:00 +01006666#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006667
Peter Zijlstra3f1d2a32014-02-12 10:49:30 +01006668 put_prev_task(rq, prev);
Peter Zijlstra606dba22012-02-11 06:05:00 +01006669
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006670 do {
Peter Zijlstra678d5712012-02-11 06:05:00 +01006671 se = pick_next_entity(cfs_rq, NULL);
Peter Zijlstraf4b67552008-11-04 21:25:07 +01006672 set_next_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006673 cfs_rq = group_cfs_rq(se);
6674 } while (cfs_rq);
6675
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01006676 p = task_of(se);
Peter Zijlstra678d5712012-02-11 06:05:00 +01006677
Norbert Manthey13a453c2018-02-27 08:47:40 +01006678done: __maybe_unused;
Uladzislau Rezki93824902017-09-13 12:24:30 +02006679#ifdef CONFIG_SMP
6680 /*
6681 * Move the next running task to the front of
6682	 * the list, so our cfs_tasks list becomes an
6683	 * MRU one.
6684 */
6685 list_move(&p->se.group_node, &rq->cfs_tasks);
6686#endif
6687
Mike Galbraithb39e66e2011-11-22 15:20:07 +01006688 if (hrtick_enabled(rq))
6689 hrtick_start_fair(rq, p);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01006690
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01006691 update_misfit_status(p, rq);
6692
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01006693 return p;
Peter Zijlstra38033c32014-01-23 20:32:21 +01006694
6695idle:
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01006696 update_misfit_status(NULL, rq);
Matt Fleming46f69fa2016-09-21 14:38:12 +01006697 new_tasks = idle_balance(rq, rf);
6698
Peter Zijlstra37e117c2014-02-14 12:25:08 +01006699 /*
6700 * Because idle_balance() releases (and re-acquires) rq->lock, it is
6701 * possible for any higher priority task to appear. In that case we
6702 * must re-start the pick_next_entity() loop.
6703 */
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04006704 if (new_tasks < 0)
Peter Zijlstra37e117c2014-02-14 12:25:08 +01006705 return RETRY_TASK;
6706
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04006707 if (new_tasks > 0)
Peter Zijlstra38033c32014-01-23 20:32:21 +01006708 goto again;
Peter Zijlstra38033c32014-01-23 20:32:21 +01006709
6710 return NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006711}
6712
6713/*
6714 * Account for a descheduled task:
6715 */
Ingo Molnar31ee5292007-08-09 11:16:49 +02006716static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006717{
6718 struct sched_entity *se = &prev->se;
6719 struct cfs_rq *cfs_rq;
6720
6721 for_each_sched_entity(se) {
6722 cfs_rq = cfs_rq_of(se);
Ingo Molnarab6cde22007-08-09 11:16:48 +02006723 put_prev_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006724 }
6725}
6726
Rik van Rielac53db52011-02-01 09:51:03 -05006727/*
6728 * sched_yield() is very simple
6729 *
6730 * The magic of dealing with the ->skip buddy is in pick_next_entity.
6731 */
6732static void yield_task_fair(struct rq *rq)
6733{
6734 struct task_struct *curr = rq->curr;
6735 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
6736 struct sched_entity *se = &curr->se;
6737
6738 /*
6739 * Are we the only task in the tree?
6740 */
6741 if (unlikely(rq->nr_running == 1))
6742 return;
6743
6744 clear_buddies(cfs_rq, se);
6745
6746 if (curr->policy != SCHED_BATCH) {
6747 update_rq_clock(rq);
6748 /*
6749 * Update run-time statistics of the 'current'.
6750 */
6751 update_curr(cfs_rq);
Mike Galbraith916671c2011-11-22 15:21:26 +01006752 /*
6753 * Tell update_rq_clock() that we've just updated,
6754 * so we don't do microscopic update in schedule()
6755 * and double the fastpath cost.
6756 */
Davidlohr Buesoadcc8da2018-04-04 09:15:39 -07006757 rq_clock_skip_update(rq);
Rik van Rielac53db52011-02-01 09:51:03 -05006758 }
6759
6760 set_skip_buddy(se);
6761}
6762
Mike Galbraithd95f4122011-02-01 09:50:51 -05006763static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
6764{
6765 struct sched_entity *se = &p->se;
6766
Paul Turner5238cdd2011-07-21 09:43:37 -07006767 /* throttled hierarchies are not runnable */
6768 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraithd95f4122011-02-01 09:50:51 -05006769 return false;
6770
6771 /* Tell the scheduler that we'd really like pse to run next. */
6772 set_next_buddy(se);
6773
Mike Galbraithd95f4122011-02-01 09:50:51 -05006774 yield_task_fair(rq);
6775
6776 return true;
6777}
6778
Peter Williams681f3e62007-10-24 18:23:51 +02006779#ifdef CONFIG_SMP
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006780/**************************************************
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006781 * Fair scheduling class load-balancing methods.
6782 *
6783 * BASICS
6784 *
6785 * The purpose of load-balancing is to achieve the same basic fairness the
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006786 * per-CPU scheduler provides, namely provide a proportional amount of compute
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006787 * time to each task. This is expressed in the following equation:
6788 *
6789 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6790 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006791 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006792 * W_i,0 is defined as:
6793 *
6794 * W_i,0 = \Sum_j w_i,j (2)
6795 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006796 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
Yuyang Du1c3de5e2016-03-30 07:07:51 +08006797 * is derived from the nice value as per sched_prio_to_weight[].
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006798 *
6799 * The weight average is an exponential decay average of the instantaneous
6800 * weight:
6801 *
6802 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6803 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006804 * C_i is the compute capacity of CPU i, typically it is the
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006805 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6806 * can also include other factors [XXX].
6807 *
6808 * To achieve this balance we define a measure of imbalance which follows
6809 * directly from (1):
6810 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04006811 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006812 *
6813 * We then move tasks around to minimize the imbalance. In the continuous
6814 * function space it is obvious this converges, in the discrete case we get
6815 * a few fun cases generally called infeasible weight scenarios.
6816 *
6817 * [XXX expand on:
6818 * - infeasible weights;
6819 * - local vs global optima in the discrete case. ]
6820 *
6821 *
6822 * SCHED DOMAINS
6823 *
6824 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006825 * for all i,j solution, we create a tree of CPUs that follows the hardware
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006826 * topology where each level pairs two lower groups (or better). This results
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006827 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006828 * tree to only the first of the previous level and we decrease the frequency
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006829 * of load-balance at each level inv. proportional to the number of CPUs in
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006830 * the groups.
6831 *
6832 * This yields:
6833 *
6834 * log_2 n 1 n
6835 * \Sum { --- * --- * 2^i } = O(n) (5)
6836 * i = 0 2^i 2^i
6837 * `- size of each group
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006838 * | | `- number of CPUs doing load-balance
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006839 * | `- freq
6840 * `- sum over all levels
6841 *
6842 * Coupled with a limit on how many tasks we can migrate every balance pass,
6843 * this makes (5) the runtime complexity of the balancer.
6844 *
6845 * An important property here is that each CPU is still (indirectly) connected
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006846 * to every other CPU in at most O(log n) steps:
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006847 *
6848 * The adjacency matrix of the resulting graph is given by:
6849 *
Byungchul Park97a71422015-07-05 18:33:48 +09006850 * log_2 n
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006851 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
6852 * k = 0
6853 *
6854 * And you'll find that:
6855 *
6856 * A^(log_2 n)_i,j != 0 for all i,j (7)
6857 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006858 * Showing there's indeed a path between every CPU in at most O(log n) steps.
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006859 * The task movement gives a factor of O(m), giving a convergence complexity
6860 * of:
6861 *
6862 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
6863 *
6864 *
6865 * WORK CONSERVING
6866 *
6867 * In order to avoid CPUs going idle while there's still work to do, new idle
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006868 * balancing is more aggressive and has the newly idle CPU iterate up the domain
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006869 * tree itself instead of relying on other CPUs to bring it work.
6870 *
6871 * This adds some complexity to both (5) and (8) but it reduces the total idle
6872 * time.
6873 *
6874 * [XXX more?]
6875 *
6876 *
6877 * CGROUPS
6878 *
6879 * Cgroups make a horror show out of (2), instead of a simple sum we get:
6880 *
6881 * s_k,i
6882 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
6883 * S_k
6884 *
6885 * Where
6886 *
6887 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
6888 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006889 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02006890 *
6891 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
6892 * property.
6893 *
6894 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
6895 * rewrite all of this once again.]
Byungchul Park97a71422015-07-05 18:33:48 +09006896 */
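
/*
 * Worked instance of (5), purely illustrative, for n = 8 CPUs: the per-level
 * terms n/2^i are 8, 4, 2 and 1 for i = 0..3, summing to 15 < 2n, i.e. O(n)
 * work per complete balance pass before the task-migration factor m that
 * appears in (8).
 */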
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006897
Hiroshi Shimamotoed387b72012-01-31 11:40:32 +09006898static unsigned long __read_mostly max_load_balance_interval = HZ/10;
6899
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01006900enum fbq_type { regular, remote, all };
6901
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01006902enum group_type {
6903 group_other = 0,
6904 group_misfit_task,
6905 group_imbalanced,
6906 group_overloaded,
6907};
6908
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01006909#define LBF_ALL_PINNED 0x01
Peter Zijlstra367456c2012-02-20 21:49:09 +01006910#define LBF_NEED_BREAK 0x02
Peter Zijlstra62633222013-08-19 12:41:09 +02006911#define LBF_DST_PINNED 0x04
6912#define LBF_SOME_PINNED 0x08
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01006913#define LBF_NOHZ_STATS 0x10
Vincent Guittotf643ea22018-02-13 11:31:17 +01006914#define LBF_NOHZ_AGAIN 0x20
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01006915
6916struct lb_env {
6917 struct sched_domain *sd;
6918
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01006919 struct rq *src_rq;
Prashanth Nageshappa85c1e7d2012-06-19 17:47:34 +05306920 int src_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01006921
6922 int dst_cpu;
6923 struct rq *dst_rq;
6924
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05306925 struct cpumask *dst_grpmask;
6926 int new_dst_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01006927 enum cpu_idle_type idle;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02006928 long imbalance;
Michael Wangb94031302012-07-12 16:10:13 +08006929 /* The set of CPUs under consideration for load-balancing */
6930 struct cpumask *cpus;
6931
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01006932 unsigned int flags;
Peter Zijlstra367456c2012-02-20 21:49:09 +01006933
6934 unsigned int loop;
6935 unsigned int loop_break;
6936 unsigned int loop_max;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01006937
6938 enum fbq_type fbq_type;
Morten Rasmussencad68e52018-07-04 11:17:42 +01006939 enum group_type src_grp_type;
Kirill Tkhai163122b2014-08-20 13:48:29 +04006940 struct list_head tasks;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01006941};
6942
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01006943/*
Peter Zijlstra029632f2011-10-25 10:00:11 +02006944 * Is this task likely cache-hot:
6945 */
Hillf Danton5d5e2b12014-06-10 10:58:43 +02006946static int task_hot(struct task_struct *p, struct lb_env *env)
Peter Zijlstra029632f2011-10-25 10:00:11 +02006947{
6948 s64 delta;
6949
Kirill Tkhaie5673f22014-08-20 13:48:01 +04006950 lockdep_assert_held(&env->src_rq->lock);
6951
Peter Zijlstra029632f2011-10-25 10:00:11 +02006952 if (p->sched_class != &fair_sched_class)
6953 return 0;
6954
6955 if (unlikely(p->policy == SCHED_IDLE))
6956 return 0;
6957
6958 /*
6959 * Buddy candidates are cache hot:
6960 */
Hillf Danton5d5e2b12014-06-10 10:58:43 +02006961 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
Peter Zijlstra029632f2011-10-25 10:00:11 +02006962 (&p->se == cfs_rq_of(&p->se)->next ||
6963 &p->se == cfs_rq_of(&p->se)->last))
6964 return 1;
6965
6966 if (sysctl_sched_migration_cost == -1)
6967 return 1;
6968 if (sysctl_sched_migration_cost == 0)
6969 return 0;
6970
Hillf Danton5d5e2b12014-06-10 10:58:43 +02006971 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
Peter Zijlstra029632f2011-10-25 10:00:11 +02006972
6973 return delta < (s64)sysctl_sched_migration_cost;
6974}
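
/*
 * Illustrative numbers for the test above, assuming the usual
 * sysctl_sched_migration_cost default of 500000 ns: a task that last ran
 * 200 us ago (delta = 200000 ns) is still considered cache hot, while one
 * idle for 2 ms is not.  A cost of -1 forces "hot", 0 disables the
 * heuristic, and buddy candidates count as hot when CACHE_HOT_BUDDY is set
 * and the destination runqueue is busy.
 */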
6975
Mel Gorman3a7053b2013-10-07 11:29:00 +01006976#ifdef CONFIG_NUMA_BALANCING
Rik van Rielc1ceac62015-05-14 22:59:36 -04006977/*
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05306978 * Returns 1, if task migration degrades locality
6979 * Returns 0, if task migration improves locality i.e migration preferred.
6980 * Returns -1, if task migration is not affected by locality.
Rik van Rielc1ceac62015-05-14 22:59:36 -04006981 */
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05306982static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
Mel Gorman3a7053b2013-10-07 11:29:00 +01006983{
Rik van Rielb1ad0652014-05-15 13:03:06 -04006984 struct numa_group *numa_group = rcu_dereference(p->numa_group);
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05306985 unsigned long src_weight, dst_weight;
6986 int src_nid, dst_nid, dist;
Mel Gorman3a7053b2013-10-07 11:29:00 +01006987
Srikar Dronamraju2a595722015-08-11 21:54:21 +05306988 if (!static_branch_likely(&sched_numa_balancing))
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05306989 return -1;
6990
Srikar Dronamrajuc3b9bc52015-08-11 16:30:12 +05306991 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05306992 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01006993
6994 src_nid = cpu_to_node(env->src_cpu);
6995 dst_nid = cpu_to_node(env->dst_cpu);
6996
Mel Gorman83e1d2c2013-10-07 11:29:27 +01006997 if (src_nid == dst_nid)
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05306998 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01006999
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307000 /* Migrating away from the preferred node is always bad. */
7001 if (src_nid == p->numa_preferred_nid) {
7002 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7003 return 1;
7004 else
7005 return -1;
7006 }
Mel Gorman83e1d2c2013-10-07 11:29:27 +01007007
Rik van Rielc1ceac62015-05-14 22:59:36 -04007008 /* Encourage migration to the preferred node. */
7009 if (dst_nid == p->numa_preferred_nid)
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307010 return 0;
Rik van Rielc1ceac62015-05-14 22:59:36 -04007011
Rik van Riel739294f2017-06-23 12:55:27 -04007012 /* Leaving a core idle is often worse than degrading locality. */
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307013 if (env->idle == CPU_IDLE)
Rik van Riel739294f2017-06-23 12:55:27 -04007014 return -1;
7015
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307016 dist = node_distance(src_nid, dst_nid);
Rik van Rielc1ceac62015-05-14 22:59:36 -04007017 if (numa_group) {
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307018 src_weight = group_weight(p, src_nid, dist);
7019 dst_weight = group_weight(p, dst_nid, dist);
Rik van Rielc1ceac62015-05-14 22:59:36 -04007020 } else {
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307021 src_weight = task_weight(p, src_nid, dist);
7022 dst_weight = task_weight(p, dst_nid, dist);
Rik van Rielc1ceac62015-05-14 22:59:36 -04007023 }
7024
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307025 return dst_weight < src_weight;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007026}
7027
Mel Gorman3a7053b2013-10-07 11:29:00 +01007028#else
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307029static inline int migrate_degrades_locality(struct task_struct *p,
Mel Gorman3a7053b2013-10-07 11:29:00 +01007030 struct lb_env *env)
7031{
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307032 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007033}
Mel Gorman3a7053b2013-10-07 11:29:00 +01007034#endif
7035
Peter Zijlstra029632f2011-10-25 10:00:11 +02007036/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007037 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7038 */
7039static
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007040int can_migrate_task(struct task_struct *p, struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007041{
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307042 int tsk_cache_hot;
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007043
7044 lockdep_assert_held(&env->src_rq->lock);
7045
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007046 /*
7047 * We do not migrate tasks that are:
Joonsoo Kimd3198082013-04-23 17:27:40 +09007048 * 1) throttled_lb_pair, or
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007049 * 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kimd3198082013-04-23 17:27:40 +09007050 * 3) running (obviously), or
7051 * 4) are cache-hot on their current CPU.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007052 */
Joonsoo Kimd3198082013-04-23 17:27:40 +09007053 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7054 return 0;
7055
Ingo Molnar0c98d342017-02-05 15:38:10 +01007056 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007057 int cpu;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307058
Josh Poimboeufae928822016-06-17 12:43:24 -05007059 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307060
Peter Zijlstra62633222013-08-19 12:41:09 +02007061 env->flags |= LBF_SOME_PINNED;
7062
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307063 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007064 * Remember if this task can be migrated to any other CPU in
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307065 * our sched_group. We may want to revisit it if we couldn't
7066 * meet load balance goals by pulling other tasks on src_cpu.
7067 *
Jeffrey Hugo65a44332017-06-07 13:18:57 -06007068 * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
7069 * already computed one in current iteration.
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307070 */
Jeffrey Hugo65a44332017-06-07 13:18:57 -06007071 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307072 return 0;
7073
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007074		/* Prevent re-selecting dst_cpu via env's CPUs: */
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007075 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
Ingo Molnar0c98d342017-02-05 15:38:10 +01007076 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
Peter Zijlstra62633222013-08-19 12:41:09 +02007077 env->flags |= LBF_DST_PINNED;
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007078 env->new_dst_cpu = cpu;
7079 break;
7080 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307081 }
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007082
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007083 return 0;
7084 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307085
7086	/* Record that we found at least one task that could run on dst_cpu */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007087 env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007088
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007089 if (task_running(env->src_rq, p)) {
Josh Poimboeufae928822016-06-17 12:43:24 -05007090 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007091 return 0;
7092 }
7093
7094 /*
7095 * Aggressive migration if:
Mel Gorman3a7053b2013-10-07 11:29:00 +01007096 * 1) destination numa is preferred
7097 * 2) task is cache cold, or
7098 * 3) too many balance attempts have failed.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007099 */
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307100 tsk_cache_hot = migrate_degrades_locality(p, env);
7101 if (tsk_cache_hot == -1)
7102 tsk_cache_hot = task_hot(p, env);
Mel Gorman3a7053b2013-10-07 11:29:00 +01007103
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307104 if (tsk_cache_hot <= 0 ||
Kirill Tkhai7a96c232014-09-22 22:36:12 +04007105 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307106 if (tsk_cache_hot == 1) {
Josh Poimboeufae928822016-06-17 12:43:24 -05007107 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
7108 schedstat_inc(p->se.statistics.nr_forced_migrations);
Mel Gorman3a7053b2013-10-07 11:29:00 +01007109 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007110 return 1;
7111 }
7112
Josh Poimboeufae928822016-06-17 12:43:24 -05007113 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
Zhang Hang4e2dcb72013-04-10 14:04:55 +08007114 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007115}
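
/*
 * Summary of the decision order above, with an illustrative value: a
 * throttled src/dst pair or a destination outside p->cpus_allowed vetoes the
 * move (while possibly recording a new_dst_cpu), a currently running task is
 * never detached, and otherwise the task migrates if it is cache cold, NUMA
 * prefers the destination, or the domain has failed to balance often enough;
 * e.g. with a hypothetical cache_nice_tries of 1, a cache-hot task is still
 * moved once two consecutive balance attempts have failed.
 */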
7116
Peter Zijlstra897c3952009-12-17 17:45:42 +01007117/*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007118 * detach_task() -- detach the task for the migration specified in env
Peter Zijlstra897c3952009-12-17 17:45:42 +01007119 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04007120static void detach_task(struct task_struct *p, struct lb_env *env)
7121{
7122 lockdep_assert_held(&env->src_rq->lock);
7123
Kirill Tkhai163122b2014-08-20 13:48:29 +04007124 p->on_rq = TASK_ON_RQ_MIGRATING;
Peter Zijlstra5704ac02017-02-21 17:15:21 +01007125 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007126 set_task_cpu(p, env->dst_cpu);
7127}
7128
7129/*
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007130 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
Peter Zijlstra897c3952009-12-17 17:45:42 +01007131 * part of active balancing operations within "domain".
Peter Zijlstra897c3952009-12-17 17:45:42 +01007132 *
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007133 * Returns a task if successful and NULL otherwise.
Peter Zijlstra897c3952009-12-17 17:45:42 +01007134 */
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007135static struct task_struct *detach_one_task(struct lb_env *env)
Peter Zijlstra897c3952009-12-17 17:45:42 +01007136{
Uladzislau Rezki93824902017-09-13 12:24:30 +02007137 struct task_struct *p;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007138
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007139 lockdep_assert_held(&env->src_rq->lock);
7140
Uladzislau Rezki93824902017-09-13 12:24:30 +02007141 list_for_each_entry_reverse(p,
7142 &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra367456c2012-02-20 21:49:09 +01007143 if (!can_migrate_task(p, env))
7144 continue;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007145
Kirill Tkhai163122b2014-08-20 13:48:29 +04007146 detach_task(p, env);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007147
Peter Zijlstra367456c2012-02-20 21:49:09 +01007148 /*
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007149 * Right now, this is only the second place where
Kirill Tkhai163122b2014-08-20 13:48:29 +04007150	 * lb_gained[env->idle] is updated (the other is detach_tasks()),
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007151 * so we can safely collect stats here rather than
Kirill Tkhai163122b2014-08-20 13:48:29 +04007152 * inside detach_tasks().
Peter Zijlstra367456c2012-02-20 21:49:09 +01007153 */
Josh Poimboeufae928822016-06-17 12:43:24 -05007154 schedstat_inc(env->sd->lb_gained[env->idle]);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007155 return p;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007156 }
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007157 return NULL;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007158}
7159
Peter Zijlstraeb953082012-04-17 13:38:40 +02007160static const unsigned int sched_nr_migrate_break = 32;
7161
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007162/*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007163 * detach_tasks() -- tries to detach up to imbalance weighted load from
7164 * busiest_rq, as part of a balancing operation within domain "sd".
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007165 *
Kirill Tkhai163122b2014-08-20 13:48:29 +04007166 * Returns number of detached tasks if successful and 0 otherwise.
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007167 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04007168static int detach_tasks(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007169{
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007170 struct list_head *tasks = &env->src_rq->cfs_tasks;
7171 struct task_struct *p;
Peter Zijlstra367456c2012-02-20 21:49:09 +01007172 unsigned long load;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007173 int detached = 0;
7174
7175 lockdep_assert_held(&env->src_rq->lock);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007176
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007177 if (env->imbalance <= 0)
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007178 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007179
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007180 while (!list_empty(tasks)) {
Yuyang Du985d3a42015-07-06 06:11:51 +08007181 /*
7182		 * We don't want to steal all the tasks, otherwise we may be treated likewise,
7183 * which could at worst lead to a livelock crash.
7184 */
7185 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7186 break;
7187
Uladzislau Rezki93824902017-09-13 12:24:30 +02007188 p = list_last_entry(tasks, struct task_struct, se.group_node);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007189
Peter Zijlstra367456c2012-02-20 21:49:09 +01007190 env->loop++;
7191 /* We've more or less seen every task there is, call it quits */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007192 if (env->loop > env->loop_max)
Peter Zijlstra367456c2012-02-20 21:49:09 +01007193 break;
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007194
7195 /* take a breather every nr_migrate tasks */
Peter Zijlstra367456c2012-02-20 21:49:09 +01007196 if (env->loop > env->loop_break) {
Peter Zijlstraeb953082012-04-17 13:38:40 +02007197 env->loop_break += sched_nr_migrate_break;
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007198 env->flags |= LBF_NEED_BREAK;
Peter Zijlstraee00e662009-12-17 17:25:20 +01007199 break;
Peter Zijlstraa195f002011-09-22 15:30:18 +02007200 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007201
Joonsoo Kimd3198082013-04-23 17:27:40 +09007202 if (!can_migrate_task(p, env))
Peter Zijlstra367456c2012-02-20 21:49:09 +01007203 goto next;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007204
Peter Zijlstra367456c2012-02-20 21:49:09 +01007205 load = task_h_load(p);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007206
Peter Zijlstraeb953082012-04-17 13:38:40 +02007207 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra367456c2012-02-20 21:49:09 +01007208 goto next;
7209
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007210 if ((load / 2) > env->imbalance)
Peter Zijlstra367456c2012-02-20 21:49:09 +01007211 goto next;
7212
Kirill Tkhai163122b2014-08-20 13:48:29 +04007213 detach_task(p, env);
7214 list_add(&p->se.group_node, &env->tasks);
7215
7216 detached++;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007217 env->imbalance -= load;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007218
7219#ifdef CONFIG_PREEMPT
Peter Zijlstraee00e662009-12-17 17:25:20 +01007220 /*
7221 * NEWIDLE balancing is a source of latency, so preemptible
Kirill Tkhai163122b2014-08-20 13:48:29 +04007222 * kernels will stop after the first task is detached to minimize
Peter Zijlstraee00e662009-12-17 17:25:20 +01007223 * the critical section.
7224 */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007225 if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstraee00e662009-12-17 17:25:20 +01007226 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007227#endif
7228
Peter Zijlstraee00e662009-12-17 17:25:20 +01007229 /*
7230 * We only want to steal up to the prescribed amount of
7231 * weighted load.
7232 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007233 if (env->imbalance <= 0)
Peter Zijlstraee00e662009-12-17 17:25:20 +01007234 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007235
Peter Zijlstra367456c2012-02-20 21:49:09 +01007236 continue;
7237next:
Uladzislau Rezki93824902017-09-13 12:24:30 +02007238 list_move(&p->se.group_node, tasks);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007239 }
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007240
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007241 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007242 * Right now, this is one of only two places we collect this stat
7243 * so we can safely collect detach_one_task() stats here rather
7244 * than inside detach_one_task().
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007245 */
Josh Poimboeufae928822016-06-17 12:43:24 -05007246 schedstat_add(env->sd->lb_gained[env->idle], detached);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007247
Kirill Tkhai163122b2014-08-20 13:48:29 +04007248 return detached;
7249}
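
/*
 * Illustrative run of the load/2 check in the loop above: with
 * env->imbalance = 250 and candidate h_loads of 600 and 300, the 600-load
 * task is skipped (600/2 = 300 > 250), the 300-load task is detached
 * (300/2 = 150 <= 250), the imbalance drops to -50 and the loop stops
 * because the target has been met.
 */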
7250
7251/*
7252 * attach_task() -- attach the task detached by detach_task() to its new rq.
7253 */
7254static void attach_task(struct rq *rq, struct task_struct *p)
7255{
7256 lockdep_assert_held(&rq->lock);
7257
7258 BUG_ON(task_rq(p) != rq);
Peter Zijlstra5704ac02017-02-21 17:15:21 +01007259 activate_task(rq, p, ENQUEUE_NOCLOCK);
Joonwoo Park3ea94de2015-11-12 19:38:54 -08007260 p->on_rq = TASK_ON_RQ_QUEUED;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007261 check_preempt_curr(rq, p, 0);
7262}
7263
7264/*
7265 * attach_one_task() -- attaches the task returned from detach_one_task() to
7266 * its new rq.
7267 */
7268static void attach_one_task(struct rq *rq, struct task_struct *p)
7269{
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007270 struct rq_flags rf;
7271
7272 rq_lock(rq, &rf);
Peter Zijlstra5704ac02017-02-21 17:15:21 +01007273 update_rq_clock(rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007274 attach_task(rq, p);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007275 rq_unlock(rq, &rf);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007276}
7277
7278/*
7279 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
7280 * new rq.
7281 */
7282static void attach_tasks(struct lb_env *env)
7283{
7284 struct list_head *tasks = &env->tasks;
7285 struct task_struct *p;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007286 struct rq_flags rf;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007287
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007288 rq_lock(env->dst_rq, &rf);
Peter Zijlstra5704ac02017-02-21 17:15:21 +01007289 update_rq_clock(env->dst_rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007290
7291 while (!list_empty(tasks)) {
7292 p = list_first_entry(tasks, struct task_struct, se.group_node);
7293 list_del_init(&p->se.group_node);
7294
7295 attach_task(env->dst_rq, p);
7296 }
7297
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007298 rq_unlock(env->dst_rq, &rf);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007299}
7300
Vincent Guittot1936c532018-02-13 11:31:18 +01007301static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7302{
7303 if (cfs_rq->avg.load_avg)
7304 return true;
7305
7306 if (cfs_rq->avg.util_avg)
7307 return true;
7308
7309 return false;
7310}
7311
Vincent Guittot91c27492018-06-28 17:45:09 +02007312static inline bool others_have_blocked(struct rq *rq)
Vincent Guittot371bf422018-06-28 17:45:05 +02007313{
7314 if (READ_ONCE(rq->avg_rt.util_avg))
7315 return true;
7316
Vincent Guittot3727e0e2018-06-28 17:45:07 +02007317 if (READ_ONCE(rq->avg_dl.util_avg))
7318 return true;
7319
Vincent Guittot11d4afd2018-09-25 11:17:42 +02007320#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
Vincent Guittot91c27492018-06-28 17:45:09 +02007321 if (READ_ONCE(rq->avg_irq.util_avg))
7322 return true;
7323#endif
7324
Vincent Guittot371bf422018-06-28 17:45:05 +02007325 return false;
7326}
7327
Vincent Guittot1936c532018-02-13 11:31:18 +01007328#ifdef CONFIG_FAIR_GROUP_SCHED
7329
Tejun Heoa9e7f652017-04-25 17:43:50 -07007330static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
7331{
7332 if (cfs_rq->load.weight)
7333 return false;
7334
7335 if (cfs_rq->avg.load_sum)
7336 return false;
7337
7338 if (cfs_rq->avg.util_sum)
7339 return false;
7340
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02007341 if (cfs_rq->avg.runnable_load_sum)
Tejun Heoa9e7f652017-04-25 17:43:50 -07007342 return false;
7343
7344 return true;
7345}
7346
Paul Turner48a16752012-10-04 13:18:31 +02007347static void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007348{
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007349 struct rq *rq = cpu_rq(cpu);
Tejun Heoa9e7f652017-04-25 17:43:50 -07007350 struct cfs_rq *cfs_rq, *pos;
Vincent Guittot12b04872018-08-31 17:22:55 +02007351 const struct sched_class *curr_class;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007352 struct rq_flags rf;
Vincent Guittotf643ea22018-02-13 11:31:17 +01007353 bool done = true;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007354
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007355 rq_lock_irqsave(rq, &rf);
Paul Turner48a16752012-10-04 13:18:31 +02007356 update_rq_clock(rq);
Yuyang Du9d89c252015-07-15 08:04:37 +08007357
Peter Zijlstra9763b672011-07-13 13:09:25 +02007358 /*
7359 * Iterates the task_group tree in a bottom up fashion, see
7360 * list_add_leaf_cfs_rq() for details.
7361 */
Tejun Heoa9e7f652017-04-25 17:43:50 -07007362 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
Vincent Guittotbc427892017-03-17 14:47:22 +01007363 struct sched_entity *se;
7364
Yuyang Du9d89c252015-07-15 08:04:37 +08007365 /* throttled entities do not contribute to load */
7366 if (throttled_hierarchy(cfs_rq))
7367 continue;
Paul Turner48a16752012-10-04 13:18:31 +02007368
Viresh Kumar3a123bb2017-05-24 10:59:56 +05307369 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
Yuyang Du9d89c252015-07-15 08:04:37 +08007370 update_tg_load_avg(cfs_rq, 0);
Vincent Guittot4e516072016-11-08 10:53:46 +01007371
Vincent Guittotbc427892017-03-17 14:47:22 +01007372 /* Propagate pending load changes to the parent, if any: */
7373 se = cfs_rq->tg->se[cpu];
7374 if (se && !skip_blocked_update(se))
Peter Zijlstra88c06162017-05-06 17:32:43 +02007375 update_load_avg(cfs_rq_of(se), se, 0);
Tejun Heoa9e7f652017-04-25 17:43:50 -07007376
7377 /*
7378 * There can be a lot of idle CPU cgroups. Don't let fully
7379 * decayed cfs_rqs linger on the list.
7380 */
7381 if (cfs_rq_is_decayed(cfs_rq))
7382 list_del_leaf_cfs_rq(cfs_rq);
Vincent Guittot1936c532018-02-13 11:31:18 +01007383
7384 /* Don't need periodic decay once load/util_avg are null */
7385 if (cfs_rq_has_blocked(cfs_rq))
Vincent Guittotf643ea22018-02-13 11:31:17 +01007386 done = false;
Yuyang Du9d89c252015-07-15 08:04:37 +08007387 }
Vincent Guittot12b04872018-08-31 17:22:55 +02007388
7389 curr_class = rq->curr->sched_class;
7390 update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
7391 update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
Vincent Guittot91c27492018-06-28 17:45:09 +02007392 update_irq_load_avg(rq, 0);
Vincent Guittot371bf422018-06-28 17:45:05 +02007393 /* Don't need periodic decay once load/util_avg are null */
Vincent Guittot91c27492018-06-28 17:45:09 +02007394 if (others_have_blocked(rq))
Vincent Guittot371bf422018-06-28 17:45:05 +02007395 done = false;
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007396
7397#ifdef CONFIG_NO_HZ_COMMON
7398 rq->last_blocked_load_update_tick = jiffies;
Vincent Guittotf643ea22018-02-13 11:31:17 +01007399 if (done)
7400 rq->has_blocked_load = 0;
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007401#endif
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007402 rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007403}
7404
Peter Zijlstra9763b672011-07-13 13:09:25 +02007405/*
Vladimir Davydov68520792013-07-15 17:49:19 +04007406 * Compute the hierarchical load factor for cfs_rq and all its ancestors.
Peter Zijlstra9763b672011-07-13 13:09:25 +02007407 * This needs to be done in a top-down fashion because the load of a child
7408 * group is a fraction of its parent's load.
7409 */
Vladimir Davydov68520792013-07-15 17:49:19 +04007410static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra9763b672011-07-13 13:09:25 +02007411{
Vladimir Davydov68520792013-07-15 17:49:19 +04007412 struct rq *rq = rq_of(cfs_rq);
7413 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstraa35b6462012-08-08 21:46:40 +02007414 unsigned long now = jiffies;
Vladimir Davydov68520792013-07-15 17:49:19 +04007415 unsigned long load;
Peter Zijlstraa35b6462012-08-08 21:46:40 +02007416
Vladimir Davydov68520792013-07-15 17:49:19 +04007417 if (cfs_rq->last_h_load_update == now)
Peter Zijlstraa35b6462012-08-08 21:46:40 +02007418 return;
7419
Vladimir Davydov68520792013-07-15 17:49:19 +04007420 cfs_rq->h_load_next = NULL;
7421 for_each_sched_entity(se) {
7422 cfs_rq = cfs_rq_of(se);
7423 cfs_rq->h_load_next = se;
7424 if (cfs_rq->last_h_load_update == now)
7425 break;
7426 }
Peter Zijlstraa35b6462012-08-08 21:46:40 +02007427
Vladimir Davydov68520792013-07-15 17:49:19 +04007428 if (!se) {
Yuyang Du7ea241a2015-07-15 08:04:42 +08007429 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
Vladimir Davydov68520792013-07-15 17:49:19 +04007430 cfs_rq->last_h_load_update = now;
7431 }
7432
7433 while ((se = cfs_rq->h_load_next) != NULL) {
7434 load = cfs_rq->h_load;
Yuyang Du7ea241a2015-07-15 08:04:42 +08007435 load = div64_ul(load * se->avg.load_avg,
7436 cfs_rq_load_avg(cfs_rq) + 1);
Vladimir Davydov68520792013-07-15 17:49:19 +04007437 cfs_rq = group_cfs_rq(se);
7438 cfs_rq->h_load = load;
7439 cfs_rq->last_h_load_update = now;
7440 }
Peter Zijlstra9763b672011-07-13 13:09:25 +02007441}
7442
Peter Zijlstra367456c2012-02-20 21:49:09 +01007443static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra230059de2009-12-17 17:47:12 +01007444{
Peter Zijlstra367456c2012-02-20 21:49:09 +01007445 struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra230059de2009-12-17 17:47:12 +01007446
Vladimir Davydov68520792013-07-15 17:49:19 +04007447 update_cfs_rq_h_load(cfs_rq);
Yuyang Du9d89c252015-07-15 08:04:37 +08007448 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
Yuyang Du7ea241a2015-07-15 08:04:42 +08007449 cfs_rq_load_avg(cfs_rq) + 1);
Peter Zijlstra230059de2009-12-17 17:47:12 +01007450}
7451#else
Paul Turner48a16752012-10-04 13:18:31 +02007452static inline void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007453{
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08007454 struct rq *rq = cpu_rq(cpu);
7455 struct cfs_rq *cfs_rq = &rq->cfs;
Vincent Guittot12b04872018-08-31 17:22:55 +02007456 const struct sched_class *curr_class;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007457 struct rq_flags rf;
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08007458
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007459 rq_lock_irqsave(rq, &rf);
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08007460 update_rq_clock(rq);
Viresh Kumar3a123bb2017-05-24 10:59:56 +05307461 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
Vincent Guittot12b04872018-08-31 17:22:55 +02007462
7463 curr_class = rq->curr->sched_class;
7464 update_rt_rq_load_avg(rq_clock_task(rq), rq, curr_class == &rt_sched_class);
7465 update_dl_rq_load_avg(rq_clock_task(rq), rq, curr_class == &dl_sched_class);
Vincent Guittot91c27492018-06-28 17:45:09 +02007466 update_irq_load_avg(rq, 0);
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007467#ifdef CONFIG_NO_HZ_COMMON
7468 rq->last_blocked_load_update_tick = jiffies;
Vincent Guittot91c27492018-06-28 17:45:09 +02007469 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
Vincent Guittotf643ea22018-02-13 11:31:17 +01007470 rq->has_blocked_load = 0;
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007471#endif
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02007472 rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08007473}
7474
Peter Zijlstra367456c2012-02-20 21:49:09 +01007475static unsigned long task_h_load(struct task_struct *p)
7476{
Yuyang Du9d89c252015-07-15 08:04:37 +08007477 return p->se.avg.load_avg;
Peter Zijlstra230059de2009-12-17 17:47:12 +01007478}
7479#endif
7480
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007481/********** Helpers for find_busiest_group ************************/
Rik van Rielcaeb1782014-07-28 14:16:28 -04007482
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007483/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007484 * sg_lb_stats - stats of a sched_group required for load_balancing
7485 */
7486struct sg_lb_stats {
7487	unsigned long avg_load; /* Avg load across the CPUs of the group */
7488 unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007489 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007490 unsigned long load_per_task;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007491 unsigned long group_capacity;
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01007492 unsigned long group_util; /* Total utilization of the group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007493 unsigned int sum_nr_running; /* Nr tasks running in the group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007494 unsigned int idle_cpus;
7495 unsigned int group_weight;
Rik van Rielcaeb1782014-07-28 14:16:28 -04007496 enum group_type group_type;
Vincent Guittotea678212015-02-27 16:54:11 +01007497 int group_no_capacity;
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007498 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01007499#ifdef CONFIG_NUMA_BALANCING
7500 unsigned int nr_numa_running;
7501 unsigned int nr_preferred_running;
7502#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007503};
7504
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007505/*
7506 * sd_lb_stats - Structure to store the statistics of a sched_domain
7507 * during load balancing.
7508 */
7509struct sd_lb_stats {
7510 struct sched_group *busiest; /* Busiest group in this sd */
7511 struct sched_group *local; /* Local group in this sd */
Peter Zijlstra90001d62017-07-31 17:50:05 +02007512 unsigned long total_running;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007513 unsigned long total_load; /* Total load of all groups in sd */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007514 unsigned long total_capacity; /* Total capacity of all groups in sd */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007515 unsigned long avg_load; /* Average load across all groups in sd */
7516
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007517 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007518 struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09007519};
7520
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007521static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7522{
7523 /*
7524 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
7525 * local_stat because update_sg_lb_stats() does a full clear/assignment.
7526 * We must however clear busiest_stat::avg_load because
7527 * update_sd_pick_busiest() reads this before assignment.
7528 */
7529 *sds = (struct sd_lb_stats){
7530 .busiest = NULL,
7531 .local = NULL,
Peter Zijlstra90001d62017-07-31 17:50:05 +02007532 .total_running = 0UL,
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007533 .total_load = 0UL,
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007534 .total_capacity = 0UL,
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007535 .busiest_stat = {
7536 .avg_load = 0UL,
Rik van Rielcaeb1782014-07-28 14:16:28 -04007537 .sum_nr_running = 0,
7538 .group_type = group_other,
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02007539 },
7540 };
7541}
7542
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007543/**
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007544 * get_sd_load_idx - Obtain the load index for a given sched domain.
7545 * @sd: The sched_domain whose load_idx is to be obtained.
Kamalesh Babulaled1b7732013-10-13 23:06:15 +05307546 * @idle: The idle status of the CPU for whose sched_domain the load_idx is obtained.
Yacine Belkadie69f6182013-07-12 20:45:47 +02007547 *
7548 * Return: The load index.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007549 */
7550static inline int get_sd_load_idx(struct sched_domain *sd,
7551 enum cpu_idle_type idle)
7552{
7553 int load_idx;
7554
7555 switch (idle) {
7556 case CPU_NOT_IDLE:
7557 load_idx = sd->busy_idx;
7558 break;
7559
7560 case CPU_NEWLY_IDLE:
7561 load_idx = sd->newidle_idx;
7562 break;
7563 default:
7564 load_idx = sd->idle_idx;
7565 break;
7566 }
7567
7568 return load_idx;
7569}
7570
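/*
 * scale_rt_capacity() -- estimate the capacity left for CFS tasks after
 * subtracting the average utilization of RT and DL tasks, then scaling
 * the remainder down by IRQ pressure (scale_irq_capacity() is roughly
 * free * (max - irq) / max).
 *
 * Illustrative numbers only: with max = 1024, irq = 102 and an rt + dl
 * utilization of 256, free = 768 and the result is about
 * 768 * (1024 - 102) / 1024 ~= 691.
 */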
Vincent Guittot287cdaa2018-09-04 11:36:26 +02007571static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007572{
7573 struct rq *rq = cpu_rq(cpu);
Vincent Guittot287cdaa2018-09-04 11:36:26 +02007574 unsigned long max = arch_scale_cpu_capacity(sd, cpu);
Vincent Guittot523e9792018-06-28 17:45:12 +02007575 unsigned long used, free;
Vincent Guittot523e9792018-06-28 17:45:12 +02007576 unsigned long irq;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007577
Vincent Guittot2e62c472018-07-19 14:00:06 +02007578 irq = cpu_util_irq(rq);
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07007579
Vincent Guittot523e9792018-06-28 17:45:12 +02007580 if (unlikely(irq >= max))
7581 return 1;
Peter Zijlstracadefd32014-02-27 10:40:35 +01007582
Vincent Guittot523e9792018-06-28 17:45:12 +02007583 used = READ_ONCE(rq->avg_rt.util_avg);
7584 used += READ_ONCE(rq->avg_dl.util_avg);
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02007585
Vincent Guittot523e9792018-06-28 17:45:12 +02007586 if (unlikely(used >= max))
7587 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007588
Vincent Guittot523e9792018-06-28 17:45:12 +02007589 free = max - used;
Vincent Guittot2e62c472018-07-19 14:00:06 +02007590
7591 return scale_irq_capacity(free, irq, max);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007592}
7593
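/*
 * update_cpu_capacity() -- recompute the capacity this CPU can offer to
 * CFS tasks (clamped to at least 1) and record it in the rq and in the
 * bottom-level sched_group_capacity.
 */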
Nicolas Pitreced549f2014-05-26 18:19:38 -04007594static void update_cpu_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007595{
Vincent Guittot287cdaa2018-09-04 11:36:26 +02007596 unsigned long capacity = scale_rt_capacity(sd, cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007597 struct sched_group *sdg = sd->groups;
7598
Vincent Guittot523e9792018-06-28 17:45:12 +02007599 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007600
Nicolas Pitreced549f2014-05-26 18:19:38 -04007601 if (!capacity)
7602 capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007603
Nicolas Pitreced549f2014-05-26 18:19:38 -04007604 cpu_rq(cpu)->cpu_capacity = capacity;
7605 sdg->sgc->capacity = capacity;
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01007606 sdg->sgc->min_capacity = capacity;
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007607 sdg->sgc->max_capacity = capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007608}
7609
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007610void update_group_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007611{
7612 struct sched_domain *child = sd->child;
7613 struct sched_group *group, *sdg = sd->groups;
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007614 unsigned long capacity, min_capacity, max_capacity;
Vincent Guittot4ec44122011-12-12 20:21:08 +01007615 unsigned long interval;
7616
7617 interval = msecs_to_jiffies(sd->balance_interval);
7618 interval = clamp(interval, 1UL, max_load_balance_interval);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007619 sdg->sgc->next_update = jiffies + interval;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007620
7621 if (!child) {
Nicolas Pitreced549f2014-05-26 18:19:38 -04007622 update_cpu_capacity(sd, cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007623 return;
7624 }
7625
Vincent Guittotdc7ff762015-03-03 11:35:03 +01007626 capacity = 0;
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01007627 min_capacity = ULONG_MAX;
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007628 max_capacity = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007629
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02007630 if (child->flags & SD_OVERLAP) {
7631 /*
7632 * SD_OVERLAP domains cannot assume that child groups
7633 * span the current group.
7634 */
7635
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02007636 for_each_cpu(cpu, sched_group_span(sdg)) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007637 struct sched_group_capacity *sgc;
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05307638 struct rq *rq = cpu_rq(cpu);
Peter Zijlstra863bffc2013-08-28 11:44:39 +02007639
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05307640 /*
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007641 * build_sched_domains() -> init_sched_groups_capacity()
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05307642 * gets here before we've attached the domains to the
7643 * runqueues.
7644 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04007645 * Use capacity_of(), which is set irrespective of domains
7646 * in update_cpu_capacity().
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05307647 *
Vincent Guittotdc7ff762015-03-03 11:35:03 +01007648 * This avoids capacity from being 0 and
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05307649 * causing divide-by-zero issues on boot.
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05307650 */
7651 if (unlikely(!rq->sd)) {
Nicolas Pitreced549f2014-05-26 18:19:38 -04007652 capacity += capacity_of(cpu);
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01007653 } else {
7654 sgc = rq->sd->groups->sgc;
7655 capacity += sgc->capacity;
Srikar Dronamraju9abf24d2013-11-12 22:11:26 +05307656 }
7657
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01007658 min_capacity = min(capacity, min_capacity);
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007659 max_capacity = max(capacity, max_capacity);
Peter Zijlstra863bffc2013-08-28 11:44:39 +02007660 }
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02007661 } else {
7662 /*
7663 * !SD_OVERLAP domains can assume that child groups
7664 * span the current group.
Byungchul Park97a71422015-07-05 18:33:48 +09007665 */
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02007666
7667 group = child->groups;
7668 do {
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01007669 struct sched_group_capacity *sgc = group->sgc;
7670
7671 capacity += sgc->capacity;
7672 min_capacity = min(sgc->min_capacity, min_capacity);
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007673 max_capacity = max(sgc->max_capacity, max_capacity);
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02007674 group = group->next;
7675 } while (group != child->groups);
7676 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007677
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007678 sdg->sgc->capacity = capacity;
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01007679 sdg->sgc->min_capacity = min_capacity;
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007680 sdg->sgc->max_capacity = max_capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007681}
7682
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10007683/*
Vincent Guittotea678212015-02-27 16:54:11 +01007684 * Check whether the capacity of the rq has been noticeably reduced by side
7685 * activity. The imbalance_pct is used for the threshold.
7686 * Return true if the capacity is reduced
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10007687 */
7688static inline int
Vincent Guittotea678212015-02-27 16:54:11 +01007689check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10007690{
Vincent Guittotea678212015-02-27 16:54:11 +01007691 return ((rq->cpu_capacity * sd->imbalance_pct) <
7692 (rq->cpu_capacity_orig * 100));
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10007693}
7694
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007695/*
7696 * Group imbalance indicates (and tries to solve) the problem where balancing
Ingo Molnar0c98d342017-02-05 15:38:10 +01007697 * groups is inadequate due to ->cpus_allowed constraints.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007698 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007699 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
7700 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007701 * Something like:
7702 *
Ingo Molnar2b4d5b22016-11-23 07:37:00 +01007703 * { 0 1 2 3 } { 4 5 6 7 }
7704 * * * * *
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007705 *
7706 * If we were to balance group-wise we'd place two tasks in the first group and
7707 * two tasks in the second group. Clearly this is undesired as it will overload
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007708 * CPU 3 and leave one of the CPUs in the second group unused.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007709 *
7710 * The current solution to this issue is detecting the skew in the first group
Peter Zijlstra62633222013-08-19 12:41:09 +02007711 * by noticing the lower domain failed to reach balance and had difficulty
7712 * moving tasks due to affinity constraints.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007713 *
7714 * When this is so detected; this group becomes a candidate for busiest; see
Kamalesh Babulaled1b7732013-10-13 23:06:15 +05307715 * update_sd_pick_busiest(). And calculate_imbalance() and
Peter Zijlstra62633222013-08-19 12:41:09 +02007716 * find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007717 * to create an effective group imbalance.
7718 *
7719 * This is a somewhat tricky proposition since the next run might not find the
7720 * group imbalance and decide the groups need to be balanced again. A most
7721 * subtle and fragile situation.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007722 */
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007723
Peter Zijlstra62633222013-08-19 12:41:09 +02007724static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007725{
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007726 return group->sgc->imbalance;
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007727}
7728
Peter Zijlstrab37d9312013-08-28 11:50:34 +02007729/*
Vincent Guittotea678212015-02-27 16:54:11 +01007730 * group_has_capacity returns true if the group has spare capacity that could
7731 * be used by some tasks.
7732 * We consider that a group has spare capacity if the number of tasks is
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01007733 * smaller than the number of CPUs or if the utilization is lower than the
7734 * available capacity for CFS tasks.
Vincent Guittotea678212015-02-27 16:54:11 +01007735 * For the latter, we use a threshold to stabilize the state, to take into
7736 * account the variance of the tasks' load and to return true if the available
7737 * capacity is meaningful for the load balancer.
7738 * As an example, an available capacity of 1% can appear but it doesn't bring
7739 * any benefit to the load balancer.
Peter Zijlstrab37d9312013-08-28 11:50:34 +02007740 */
Vincent Guittotea678212015-02-27 16:54:11 +01007741static inline bool
7742group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
Peter Zijlstrab37d9312013-08-28 11:50:34 +02007743{
Vincent Guittotea678212015-02-27 16:54:11 +01007744 if (sgs->sum_nr_running < sgs->group_weight)
7745 return true;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02007746
Vincent Guittotea678212015-02-27 16:54:11 +01007747 if ((sgs->group_capacity * 100) >
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01007748 (sgs->group_util * env->sd->imbalance_pct))
Vincent Guittotea678212015-02-27 16:54:11 +01007749 return true;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02007750
Vincent Guittotea678212015-02-27 16:54:11 +01007751 return false;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02007752}
7753
Vincent Guittotea678212015-02-27 16:54:11 +01007754/*
7755 * group_is_overloaded returns true if the group has more tasks than it can
7756 * handle.
7757 * group_is_overloaded is not equal to !group_has_capacity because a group
7758 * with the exact right number of tasks has no more spare capacity but is not
7759 * overloaded so both group_has_capacity and group_is_overloaded return
7760 * false.
7761 */
7762static inline bool
7763group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
Rik van Rielcaeb1782014-07-28 14:16:28 -04007764{
Vincent Guittotea678212015-02-27 16:54:11 +01007765 if (sgs->sum_nr_running <= sgs->group_weight)
7766 return false;
7767
7768 if ((sgs->group_capacity * 100) <
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01007769 (sgs->group_util * env->sd->imbalance_pct))
Vincent Guittotea678212015-02-27 16:54:11 +01007770 return true;
7771
7772 return false;
7773}
7774
Morten Rasmussen9e0994c2016-10-14 14:41:10 +01007775/*
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007776 * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
Morten Rasmussen9e0994c2016-10-14 14:41:10 +01007777 * per-CPU capacity than sched_group ref.
7778 */
7779static inline bool
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007780group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
Morten Rasmussen9e0994c2016-10-14 14:41:10 +01007781{
7782 return sg->sgc->min_capacity * capacity_margin <
7783 ref->sgc->min_capacity * 1024;
7784}
7785
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007786/*
7787 * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
7788 * per-CPU capacity_orig than sched_group ref.
7789 */
7790static inline bool
7791group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7792{
7793 return sg->sgc->max_capacity * capacity_margin <
7794 ref->sgc->max_capacity * 1024;
7795}
7796
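/*
 * group_classify() -- order of precedence: a group that cannot hold its
 * current load is group_overloaded, an affinity-skewed group is
 * group_imbalanced, a group carrying a task too big for its CPUs is
 * group_misfit_task, anything else is group_other.
 */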
Leo Yan79a89f92015-09-15 18:56:45 +08007797static inline enum
7798group_type group_classify(struct sched_group *group,
7799 struct sg_lb_stats *sgs)
Vincent Guittotea678212015-02-27 16:54:11 +01007800{
7801 if (sgs->group_no_capacity)
Rik van Rielcaeb1782014-07-28 14:16:28 -04007802 return group_overloaded;
7803
7804 if (sg_imbalanced(group))
7805 return group_imbalanced;
7806
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007807 if (sgs->group_misfit_task_load)
7808 return group_misfit_task;
7809
Rik van Rielcaeb1782014-07-28 14:16:28 -04007810 return group_other;
7811}
7812
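/*
 * update_nohz_stats() -- refresh the blocked averages of a nohz-idle CPU
 * if it still has blocked load and the rate limit has expired (or @force
 * is set). Returns whether the rq should still be treated as carrying
 * blocked load.
 */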
Peter Zijlstra63928382018-02-13 16:54:17 +01007813static bool update_nohz_stats(struct rq *rq, bool force)
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007814{
7815#ifdef CONFIG_NO_HZ_COMMON
7816 unsigned int cpu = rq->cpu;
7817
Vincent Guittotf643ea22018-02-13 11:31:17 +01007818 if (!rq->has_blocked_load)
7819 return false;
7820
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007821 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
Vincent Guittotf643ea22018-02-13 11:31:17 +01007822 return false;
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007823
Peter Zijlstra63928382018-02-13 16:54:17 +01007824 if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
Vincent Guittotf643ea22018-02-13 11:31:17 +01007825 return true;
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007826
7827 update_blocked_averages(cpu);
Vincent Guittotf643ea22018-02-13 11:31:17 +01007828
7829 return rq->has_blocked_load;
7830#else
7831 return false;
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007832#endif
7833}
7834
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007835/**
7836 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
7837 * @env: The load balancing environment.
7838 * @group: sched_group whose statistics are to be updated.
7839 * @load_idx: Load index of sched_domain of this_cpu for load calc.
7840 * @local_group: Does group contain this_cpu.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007841 * @sgs: variable to hold the statistics for this group.
Valentin Schneider757ffdd2018-07-04 11:17:47 +01007842 * @overload: Indicate pullable load (e.g. >1 runnable task).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007843 */
7844static inline void update_sg_lb_stats(struct lb_env *env,
7845 struct sched_group *group, int load_idx,
Tim Chen4486edd2014-06-23 12:16:49 -07007846 int local_group, struct sg_lb_stats *sgs,
7847 bool *overload)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007848{
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02007849 unsigned long load;
Waiman Longa426f992015-11-25 14:09:38 -05007850 int i, nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007851
Peter Zijlstrab72ff132013-08-28 10:32:32 +02007852 memset(sgs, 0, sizeof(*sgs));
7853
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02007854 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007855 struct rq *rq = cpu_rq(i);
7856
Peter Zijlstra63928382018-02-13 16:54:17 +01007857 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
Vincent Guittotf643ea22018-02-13 11:31:17 +01007858 env->flags |= LBF_NOHZ_AGAIN;
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01007859
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007860 /* Bias balancing toward CPUs of our domain: */
Peter Zijlstra62633222013-08-19 12:41:09 +02007861 if (local_group)
Peter Zijlstra04f733b2012-05-11 00:12:02 +02007862 load = target_load(i, load_idx);
Peter Zijlstra62633222013-08-19 12:41:09 +02007863 else
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007864 load = source_load(i, load_idx);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007865
7866 sgs->group_load += load;
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01007867 sgs->group_util += cpu_util(i);
Vincent Guittot65fdac02014-08-26 13:06:46 +02007868 sgs->sum_nr_running += rq->cfs.h_nr_running;
Tim Chen4486edd2014-06-23 12:16:49 -07007869
Waiman Longa426f992015-11-25 14:09:38 -05007870 nr_running = rq->nr_running;
7871 if (nr_running > 1)
Tim Chen4486edd2014-06-23 12:16:49 -07007872 *overload = true;
7873
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01007874#ifdef CONFIG_NUMA_BALANCING
7875 sgs->nr_numa_running += rq->nr_numa_running;
7876 sgs->nr_preferred_running += rq->nr_preferred_running;
7877#endif
Viresh Kumarc7132dd2017-05-24 10:59:54 +05307878 sgs->sum_weighted_load += weighted_cpuload(rq);
Waiman Longa426f992015-11-25 14:09:38 -05007879 /*
7880 * No need to call idle_cpu() if nr_running is not 0
7881 */
7882 if (!nr_running && idle_cpu(i))
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07007883 sgs->idle_cpus++;
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007884
7885 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
Valentin Schneider757ffdd2018-07-04 11:17:47 +01007886 sgs->group_misfit_task_load < rq->misfit_task_load) {
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007887 sgs->group_misfit_task_load = rq->misfit_task_load;
Valentin Schneider757ffdd2018-07-04 11:17:47 +01007888 *overload = 1;
7889 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007890 }
7891
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04007892 /* Adjust by relative CPU capacity of the group */
7893 sgs->group_capacity = group->sgc->capacity;
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04007894 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
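	/*
	 * Illustrative numbers only: group_load = 2048 on a group of
	 * capacity 512 gives avg_load = 2048 * 1024 / 512 = 4096.
	 */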
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007895
Suresh Siddhadd5feea2010-02-23 16:13:52 -08007896 if (sgs->sum_nr_running)
Peter Zijlstra38d0f772013-08-15 19:47:56 +02007897 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007898
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07007899 sgs->group_weight = group->group_weight;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02007900
Vincent Guittotea678212015-02-27 16:54:11 +01007901 sgs->group_no_capacity = group_is_overloaded(env, sgs);
Leo Yan79a89f92015-09-15 18:56:45 +08007902 sgs->group_type = group_classify(group, sgs);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007903}
7904
7905/**
Michael Neuling532cb4c2010-06-08 14:57:02 +10007906 * update_sd_pick_busiest - return 1 on busiest group
Randy Dunlapcd968912012-06-08 13:18:33 -07007907 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10007908 * @sds: sched_domain statistics
7909 * @sg: sched_group candidate to be checked for being the busiest
Michael Neulingb6b12292010-06-10 12:06:21 +10007910 * @sgs: sched_group statistics
Michael Neuling532cb4c2010-06-08 14:57:02 +10007911 *
7912 * Determine if @sg is a busier group than the previously selected
7913 * busiest group.
Yacine Belkadie69f6182013-07-12 20:45:47 +02007914 *
7915 * Return: %true if @sg is a busier group than the previously selected
7916 * busiest group. %false otherwise.
Michael Neuling532cb4c2010-06-08 14:57:02 +10007917 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007918static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling532cb4c2010-06-08 14:57:02 +10007919 struct sd_lb_stats *sds,
7920 struct sched_group *sg,
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007921 struct sg_lb_stats *sgs)
Michael Neuling532cb4c2010-06-08 14:57:02 +10007922{
Rik van Rielcaeb1782014-07-28 14:16:28 -04007923 struct sg_lb_stats *busiest = &sds->busiest_stat;
Michael Neuling532cb4c2010-06-08 14:57:02 +10007924
Morten Rasmussencad68e52018-07-04 11:17:42 +01007925 /*
7926 * Don't try to pull misfit tasks we can't help.
7927 * We can use max_capacity here as reduction in capacity on some
7928 * CPUs in the group should either be possible to resolve
7929 * internally or be covered by avg_load imbalance (eventually).
7930 */
7931 if (sgs->group_type == group_misfit_task &&
7932 (!group_smaller_max_cpu_capacity(sg, sds->local) ||
7933 !group_has_capacity(env, &sds->local_stat)))
7934 return false;
7935
Rik van Rielcaeb1782014-07-28 14:16:28 -04007936 if (sgs->group_type > busiest->group_type)
Michael Neuling532cb4c2010-06-08 14:57:02 +10007937 return true;
7938
Rik van Rielcaeb1782014-07-28 14:16:28 -04007939 if (sgs->group_type < busiest->group_type)
7940 return false;
7941
7942 if (sgs->avg_load <= busiest->avg_load)
7943 return false;
7944
Morten Rasmussen9e0994c2016-10-14 14:41:10 +01007945 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
7946 goto asym_packing;
7947
7948 /*
7949 * Candidate sg has no more than one task per CPU and
7950 * has higher per-CPU capacity. Migrating tasks to less
7951 * capable CPUs may harm throughput. Maximize throughput;
7952 * power/energy consequences are not considered.
7953 */
7954 if (sgs->sum_nr_running <= sgs->group_weight &&
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01007955 group_smaller_min_cpu_capacity(sds->local, sg))
Morten Rasmussen9e0994c2016-10-14 14:41:10 +01007956 return false;
7957
Morten Rasmussencad68e52018-07-04 11:17:42 +01007958 /*
7959 * If we have more than one misfit sg go with the biggest misfit.
7960 */
7961 if (sgs->group_type == group_misfit_task &&
7962 sgs->group_misfit_task_load < busiest->group_misfit_task_load)
7963 return false;
7964
Morten Rasmussen9e0994c2016-10-14 14:41:10 +01007965asym_packing:
Rik van Rielcaeb1782014-07-28 14:16:28 -04007966 /* This is the busiest node in its class. */
7967 if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling532cb4c2010-06-08 14:57:02 +10007968 return true;
7969
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007970 /* No ASYM_PACKING if target CPU is already busy */
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05307971 if (env->idle == CPU_NOT_IDLE)
7972 return true;
Michael Neuling532cb4c2010-06-08 14:57:02 +10007973 /*
Tim Chenafe06ef2016-11-22 12:23:53 -08007974 * ASYM_PACKING needs to move all the work to the highest
7975 * priority CPUs in the group, therefore mark all groups
7976 * of lower priority than ourselves as busy.
Michael Neuling532cb4c2010-06-08 14:57:02 +10007977 */
Tim Chenafe06ef2016-11-22 12:23:53 -08007978 if (sgs->sum_nr_running &&
7979 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10007980 if (!sds->busiest)
7981 return true;
7982
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007983		/* Prefer to move work away from the lowest priority CPU */
Tim Chenafe06ef2016-11-22 12:23:53 -08007984 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7985 sg->asym_prefer_cpu))
Michael Neuling532cb4c2010-06-08 14:57:02 +10007986 return true;
7987 }
7988
7989 return false;
7990}
7991
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01007992#ifdef CONFIG_NUMA_BALANCING
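/*
 * fbq (find_busiest_queue) NUMA classification, roughly: 'regular' --
 * there are tasks with no NUMA placement yet, 'remote' -- all tasks are
 * NUMA-placed but some run off their preferred node, 'all' -- every
 * task already sits on its preferred node.
 */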
7993static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
7994{
7995 if (sgs->sum_nr_running > sgs->nr_numa_running)
7996 return regular;
7997 if (sgs->sum_nr_running > sgs->nr_preferred_running)
7998 return remote;
7999 return all;
8000}
8001
8002static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8003{
8004 if (rq->nr_running > rq->nr_numa_running)
8005 return regular;
8006 if (rq->nr_running > rq->nr_preferred_running)
8007 return remote;
8008 return all;
8009}
8010#else
8011static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8012{
8013 return all;
8014}
8015
8016static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8017{
8018 return regular;
8019}
8020#endif /* CONFIG_NUMA_BALANCING */
8021
Michael Neuling532cb4c2010-06-08 14:57:02 +10008022/**
Hui Kang461819a2011-10-11 23:00:59 -04008023 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07008024 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008025 * @sds: variable to hold the statistics for this sched_domain.
8026 */
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008027static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008028{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008029 struct sched_domain *child = env->sd->child;
8030 struct sched_group *sg = env->sd->groups;
Srikar Dronamraju05b40e02017-03-22 23:27:50 +05308031 struct sg_lb_stats *local = &sds->local_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008032 struct sg_lb_stats tmp_sgs;
Valentin Schneiderdbbad712018-07-04 11:17:44 +01008033 int load_idx;
Tim Chen4486edd2014-06-23 12:16:49 -07008034 bool overload = false;
Valentin Schneiderdbbad712018-07-04 11:17:44 +01008035 bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008036
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01008037#ifdef CONFIG_NO_HZ_COMMON
Vincent Guittotf643ea22018-02-13 11:31:17 +01008038 if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01008039 env->flags |= LBF_NOHZ_STATS;
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01008040#endif
8041
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008042 load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008043
8044 do {
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008045 struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008046 int local_group;
8047
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02008048 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008049 if (local_group) {
8050 sds->local = sg;
Srikar Dronamraju05b40e02017-03-22 23:27:50 +05308051 sgs = local;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008052
8053 if (env->idle != CPU_NEWLY_IDLE ||
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008054 time_after_eq(jiffies, sg->sgc->next_update))
8055 update_group_capacity(env->sd, env->dst_cpu);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008056 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008057
Tim Chen4486edd2014-06-23 12:16:49 -07008058 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
8059 &overload);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008060
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008061 if (local_group)
8062 goto next_group;
8063
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008064 /*
8065 * In case the child domain prefers tasks to go to siblings
Vincent Guittotea678212015-02-27 16:54:11 +01008066 * first, lower the sg capacity so that we'll try
Nikhil Rao75dd3212010-10-15 13:12:30 -07008067 * and move all the excess tasks away. We lower the capacity
8068 * of a group only if the local group has the capacity to fit
Vincent Guittotea678212015-02-27 16:54:11 +01008069 * these excess tasks. The extra check prevents the case where
8070 * you always pull from the heaviest group when it is already
8071 * under-utilized (possible when a large-weight task outweighs
8072 * the other tasks on the system).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008073 */
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008074 if (prefer_sibling && sds->local &&
Srikar Dronamraju05b40e02017-03-22 23:27:50 +05308075 group_has_capacity(env, local) &&
8076 (sgs->sum_nr_running > local->sum_nr_running + 1)) {
Vincent Guittotea678212015-02-27 16:54:11 +01008077 sgs->group_no_capacity = 1;
Leo Yan79a89f92015-09-15 18:56:45 +08008078 sgs->group_type = group_classify(sg, sgs);
Wanpeng Licb0b9f22014-11-05 07:44:50 +08008079 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008080
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008081 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10008082 sds->busiest = sg;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008083 sds->busiest_stat = *sgs;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008084 }
8085
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008086next_group:
8087 /* Now, start updating sd_lb_stats */
Peter Zijlstra90001d62017-07-31 17:50:05 +02008088 sds->total_running += sgs->sum_nr_running;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008089 sds->total_load += sgs->group_load;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008090 sds->total_capacity += sgs->group_capacity;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008091
Michael Neuling532cb4c2010-06-08 14:57:02 +10008092 sg = sg->next;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008093 } while (sg != env->sd->groups);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008094
Vincent Guittotf643ea22018-02-13 11:31:17 +01008095#ifdef CONFIG_NO_HZ_COMMON
8096 if ((env->flags & LBF_NOHZ_AGAIN) &&
8097 cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
8098
8099 WRITE_ONCE(nohz.next_blocked,
8100 jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
8101 }
8102#endif
8103
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008104 if (env->sd->flags & SD_NUMA)
8105 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
Tim Chen4486edd2014-06-23 12:16:49 -07008106
8107 if (!env->sd->parent) {
8108 /* update overload indicator if we are at root domain */
Valentin Schneidere90c8fe2018-07-04 11:17:46 +01008109 if (READ_ONCE(env->dst_rq->rd->overload) != overload)
8110 WRITE_ONCE(env->dst_rq->rd->overload, overload);
Tim Chen4486edd2014-06-23 12:16:49 -07008111 }
Michael Neuling532cb4c2010-06-08 14:57:02 +10008112}
8113
Michael Neuling532cb4c2010-06-08 14:57:02 +10008114/**
8115 * check_asym_packing - Check to see if the group is packed into the
Masanari Iida0ba42a52017-03-07 20:48:02 +09008116 * sched domain.
Michael Neuling532cb4c2010-06-08 14:57:02 +10008117 *
8118 * This is primarily intended to be used at the sibling level. Some
8119 * cores like POWER7 prefer to use lower numbered SMT threads. In the
8120 * case of POWER7, it can move to lower SMT modes only when higher
8121 * threads are idle. When in lower SMT modes, the threads will
8122 * perform better since they share fewer core resources. Hence when we
8123 * have idle threads, we want them to be the higher ones.
8124 *
8125 * This packing function is run on idle threads. It checks to see if
8126 * the busiest CPU in this domain (core in the P7 case) has a higher
8127 * CPU number than the packing function is being run on. Here we are
8128 * assuming a lower CPU number will be equivalent to a lower SMT thread
8129 * number.
8130 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02008131 * Return: 1 when packing is required and a task should be moved to
Randy Dunlap46123352017-09-10 09:55:05 -07008132 * this CPU. The amount of the imbalance is returned in env->imbalance.
Michael Neulingb6b12292010-06-10 12:06:21 +10008133 *
Randy Dunlapcd968912012-06-08 13:18:33 -07008134 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10008135 * @sds: Statistics of the sched_domain which is to be packed
Michael Neuling532cb4c2010-06-08 14:57:02 +10008136 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008137static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
Michael Neuling532cb4c2010-06-08 14:57:02 +10008138{
8139 int busiest_cpu;
8140
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008141 if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling532cb4c2010-06-08 14:57:02 +10008142 return 0;
8143
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05308144 if (env->idle == CPU_NOT_IDLE)
8145 return 0;
8146
Michael Neuling532cb4c2010-06-08 14:57:02 +10008147 if (!sds->busiest)
8148 return 0;
8149
Tim Chenafe06ef2016-11-22 12:23:53 -08008150 busiest_cpu = sds->busiest->asym_prefer_cpu;
8151 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
Michael Neuling532cb4c2010-06-08 14:57:02 +10008152 return 0;
8153
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008154 env->imbalance = DIV_ROUND_CLOSEST(
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008155 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008156 SCHED_CAPACITY_SCALE);
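	/*
	 * Illustrative numbers only: avg_load = 512 on a group with
	 * capacity 2048 yields an imbalance of 512 * 2048 / 1024 = 1024.
	 */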
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008157
Michael Neuling532cb4c2010-06-08 14:57:02 +10008158 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008159}
8160
8161/**
8162 * fix_small_imbalance - Calculate the minor imbalance that exists
8163 * amongst the groups of a sched_domain, during
8164 * load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07008165 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008166 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008167 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008168static inline
8169void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008170{
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008171 unsigned long tmp, capa_now = 0, capa_move = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008172 unsigned int imbn = 2;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008173 unsigned long scaled_busy_load_per_task;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008174 struct sg_lb_stats *local, *busiest;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008175
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008176 local = &sds->local_stat;
8177 busiest = &sds->busiest_stat;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008178
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008179 if (!local->sum_nr_running)
8180 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8181 else if (busiest->load_per_task > local->load_per_task)
8182 imbn = 1;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008183
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008184 scaled_busy_load_per_task =
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008185 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008186 busiest->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008187
Vladimir Davydov3029ede2013-09-15 17:49:14 +04008188 if (busiest->avg_load + scaled_busy_load_per_task >=
8189 local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008190 env->imbalance = busiest->load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008191 return;
8192 }
8193
8194 /*
8195 * OK, we don't have enough imbalance to justify moving tasks,
Nicolas Pitreced549f2014-05-26 18:19:38 -04008196 * however we may be able to increase total CPU capacity used by
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008197 * moving them.
8198 */
8199
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008200 capa_now += busiest->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008201 min(busiest->load_per_task, busiest->avg_load);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008202 capa_now += local->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008203 min(local->load_per_task, local->avg_load);
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008204 capa_now /= SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008205
8206 /* Amount of load we'd subtract */
Vincent Guittota2cd4262014-03-11 17:26:06 +01008207 if (busiest->avg_load > scaled_busy_load_per_task) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008208 capa_move += busiest->group_capacity *
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008209 min(busiest->load_per_task,
Vincent Guittota2cd4262014-03-11 17:26:06 +01008210 busiest->avg_load - scaled_busy_load_per_task);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008211 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008212
8213 /* Amount of load we'd add */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008214 if (busiest->avg_load * busiest->group_capacity <
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008215 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008216 tmp = (busiest->avg_load * busiest->group_capacity) /
8217 local->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008218 } else {
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008219 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008220 local->group_capacity;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008221 }
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008222 capa_move += local->group_capacity *
Peter Zijlstra3ae11c92013-08-15 20:37:48 +02008223 min(local->load_per_task, local->avg_load + tmp);
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008224 capa_move /= SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008225
8226 /* Move if we gain throughput */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008227 if (capa_move > capa_now)
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008228 env->imbalance = busiest->load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008229}
8230
8231/**
8232 * calculate_imbalance - Calculate the amount of imbalance present within the
8233 * groups of a given sched_domain during load balance.
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008234 * @env: load balance environment
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008235 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008236 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008237static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008238{
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008239 unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008240 struct sg_lb_stats *local, *busiest;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008241
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008242 local = &sds->local_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008243 busiest = &sds->busiest_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008244
Rik van Rielcaeb1782014-07-28 14:16:28 -04008245 if (busiest->group_type == group_imbalanced) {
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008246 /*
8247 * In the group_imb case we cannot rely on group-wide averages
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008248 * to ensure CPU-load equilibrium, look at wider averages. XXX
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008249 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008250 busiest->load_per_task =
8251 min(busiest->load_per_task, sds->avg_load);
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008252 }
8253
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008254 /*
Dietmar Eggemann885e5422016-04-29 20:32:39 +01008255 * Avg load of busiest sg can be less and avg load of local sg can
8256 * be greater than avg load across all sgs of sd because avg load
8257 * factors in sg capacity and sgs with smaller group_type are
8258 * skipped when updating the busiest sg:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008259 */
Morten Rasmussencad68e52018-07-04 11:17:42 +01008260 if (busiest->group_type != group_misfit_task &&
8261 (busiest->avg_load <= sds->avg_load ||
8262 local->avg_load >= sds->avg_load)) {
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008263 env->imbalance = 0;
8264 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008265 }
8266
Peter Zijlstra9a5d9ba2014-07-29 17:15:11 +02008267 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008268 * If there aren't any idle CPUs, avoid creating some.
Peter Zijlstra9a5d9ba2014-07-29 17:15:11 +02008269 */
8270 if (busiest->group_type == group_overloaded &&
8271 local->group_type == group_overloaded) {
Peter Zijlstra1be0eb22016-05-06 12:21:23 +02008272 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
Morten Rasmussencfa10332016-04-29 20:32:40 +01008273 if (load_above_capacity > busiest->group_capacity) {
Vincent Guittotea678212015-02-27 16:54:11 +01008274 load_above_capacity -= busiest->group_capacity;
Dietmar Eggemann26656212016-08-10 11:27:27 +01008275 load_above_capacity *= scale_load_down(NICE_0_LOAD);
Morten Rasmussencfa10332016-04-29 20:32:40 +01008276 load_above_capacity /= busiest->group_capacity;
8277 } else
Vincent Guittotea678212015-02-27 16:54:11 +01008278 load_above_capacity = ~0UL;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008279 }
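	/*
	 * Illustrative numbers only: 5 runnable tasks on a group of
	 * capacity 4096 give 5 * 1024 - 4096 = 1024 excess capacity units,
	 * converted to load as 1024 * 1024 / 4096 = 256.
	 */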
8280
8281 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008282 * We're trying to get all the CPUs to the average_load, so we don't
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008283 * want to push ourselves above the average load, nor do we wish to
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008284 * reduce the max loaded CPU below the average load. At the same time,
Dietmar Eggemann0a9b23c2016-04-29 20:32:38 +01008285 * we also don't want to reduce the group load below the group
8286 * capacity. Thus we look for the minimum possible imbalance.
Suresh Siddhadd5feea2010-02-23 16:13:52 -08008287 */
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008288 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008289
8290 /* How much load to actually move to equalise the imbalance */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008291 env->imbalance = min(
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008292 max_pull * busiest->group_capacity,
8293 (sds->avg_load - local->avg_load) * local->group_capacity
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008294 ) / SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008295
Morten Rasmussencad68e52018-07-04 11:17:42 +01008296 /* Boost imbalance to allow misfit task to be balanced. */
8297 if (busiest->group_type == group_misfit_task) {
8298 env->imbalance = max_t(long, env->imbalance,
8299 busiest->group_misfit_task_load);
8300 }
8301
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008302 /*
8303 * if *imbalance is less than the average load per runnable task
Lucas De Marchi25985ed2011-03-30 22:57:33 -03008304 * there is no guarantee that any tasks will be moved, so consider
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008305 * bumping its value to force at least one task to be
8306 * moved.
8307 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008308 if (env->imbalance < busiest->load_per_task)
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008309 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008310}
Nikhil Raofab47622010-10-15 13:12:29 -07008311
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008312/******* find_busiest_group() helpers end here *********************/
8313
8314/**
8315 * find_busiest_group - Returns the busiest group within the sched_domain
Dietmar Eggemann0a9b23c2016-04-29 20:32:38 +01008316 * if there is an imbalance.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008317 *
8318 * Also calculates the amount of weighted load which should be moved
8319 * to restore balance.
8320 *
Randy Dunlapcd968912012-06-08 13:18:33 -07008321 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008322 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02008323 * Return: The busiest group if imbalance exists.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008324 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008325static struct sched_group *find_busiest_group(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008326{
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008327 struct sg_lb_stats *local, *busiest;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008328 struct sd_lb_stats sds;
8329
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008330 init_sd_lb_stats(&sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008331
8332 /*
8333 * Compute the various statistics relevant for load balancing at
8334 * this level.
8335 */
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008336 update_sd_lb_stats(env, &sds);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008337 local = &sds.local_stat;
8338 busiest = &sds.busiest_stat;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008339
Vincent Guittotea678212015-02-27 16:54:11 +01008340 /* ASYM feature bypasses nice load balance check */
Srikar Dronamraju1f621e02016-04-06 18:47:40 +05308341 if (check_asym_packing(env, &sds))
Michael Neuling532cb4c2010-06-08 14:57:02 +10008342 return sds.busiest;
8343
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008344 /* There is no busy sibling group to pull tasks from */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008345 if (!sds.busiest || busiest->sum_nr_running == 0)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008346 goto out_balanced;
8347
Peter Zijlstra90001d62017-07-31 17:50:05 +02008348 /* XXX broken for overlapping NUMA groups */
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04008349 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8350 / sds.total_capacity;
Ken Chenb0432d82011-04-07 17:23:22 -07008351
Peter Zijlstra866ab432011-02-21 18:56:47 +01008352 /*
8353 * If the busiest group is imbalanced the below checks don't
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008354 * work because they assume all things are equal, which typically
Peter Zijlstra866ab432011-02-21 18:56:47 +01008355 * isn't true due to cpus_allowed constraints and the like.
8356 */
Rik van Rielcaeb1782014-07-28 14:16:28 -04008357 if (busiest->group_type == group_imbalanced)
Peter Zijlstra866ab432011-02-21 18:56:47 +01008358 goto force_balance;
8359
Brendan Jackman583ffd92017-10-05 11:58:54 +01008360 /*
8361 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
8362 * capacities from resulting in underutilization due to avg_load.
8363 */
8364 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
Vincent Guittotea678212015-02-27 16:54:11 +01008365 busiest->group_no_capacity)
Nikhil Raofab47622010-10-15 13:12:29 -07008366 goto force_balance;
8367
Morten Rasmussencad68e52018-07-04 11:17:42 +01008368 /* Misfit tasks should be dealt with regardless of the avg load */
8369 if (busiest->group_type == group_misfit_task)
8370 goto force_balance;
8371
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008372 /*
Zhihui Zhang9c58c792014-09-20 21:24:36 -04008373 * If the local group is busier than the selected busiest group
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008374 * don't try and pull any tasks.
8375 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008376 if (local->avg_load >= busiest->avg_load)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008377 goto out_balanced;
8378
Peter Zijlstracc57aa82011-02-21 18:55:32 +01008379 /*
8380 * Don't pull any tasks if this group is already above the domain
8381 * average load.
8382 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008383 if (local->avg_load >= sds.avg_load)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008384 goto out_balanced;
8385
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008386 if (env->idle == CPU_IDLE) {
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008387 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008388 * This CPU is idle. If the busiest group is not overloaded
Vincent Guittot43f4d662014-10-01 15:38:55 +02008389 * and there is no imbalance between this and busiest group
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008390 * wrt idle CPUs, it is balanced. The imbalance becomes
Vincent Guittot43f4d662014-10-01 15:38:55 +02008391		 * significant only if the diff is greater than 1, otherwise we
8392		 * might end up just moving the imbalance to another group.
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008393 */
Vincent Guittot43f4d662014-10-01 15:38:55 +02008394 if ((busiest->group_type != group_overloaded) &&
8395 (local->idle_cpus <= (busiest->idle_cpus + 1)))
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008396 goto out_balanced;
Peter Zijlstrac186faf2011-02-21 18:52:53 +01008397 } else {
8398 /*
8399 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
8400 * imbalance_pct to be conservative.
8401 */
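		/*
		 * Illustration (assuming the common default imbalance_pct of
		 * 125): the busiest group's avg_load has to exceed the local
		 * group's by more than 25% before we consider pulling tasks.
		 */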
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008402 if (100 * busiest->avg_load <=
8403 env->sd->imbalance_pct * local->avg_load)
Peter Zijlstrac186faf2011-02-21 18:52:53 +01008404 goto out_balanced;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008405 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008406
Nikhil Raofab47622010-10-15 13:12:29 -07008407force_balance:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008408 /* Looks like there is an imbalance. Compute it */
Morten Rasmussencad68e52018-07-04 11:17:42 +01008409 env->src_grp_type = busiest->group_type;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008410 calculate_imbalance(env, &sds);
Vincent Guittotbb3485c2018-09-07 09:51:04 +02008411 return env->imbalance ? sds.busiest : NULL;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008412
8413out_balanced:
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008414 env->imbalance = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008415 return NULL;
8416}
8417
8418/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008419 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008420 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008421static struct rq *find_busiest_queue(struct lb_env *env,
Michael Wangb94031302012-07-12 16:10:13 +08008422 struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008423{
8424 struct rq *busiest = NULL, *rq;
Nicolas Pitreced549f2014-05-26 18:19:38 -04008425 unsigned long busiest_load = 0, busiest_capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008426 int i;
8427
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02008428 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
Vincent Guittotea678212015-02-27 16:54:11 +01008429 unsigned long capacity, wl;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008430 enum fbq_type rt;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008431
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008432 rq = cpu_rq(i);
8433 rt = fbq_classify_rq(rq);
8434
8435 /*
8436 * We classify groups/runqueues into three groups:
8437 * - regular: there are !numa tasks
8438 * - remote: there are numa tasks that run on the 'wrong' node
8439 * - all: there is no distinction
8440 *
8441 * In order to avoid migrating ideally placed numa tasks,
8442		 * ignore those when there are better options.
8443 *
8444 * If we ignore the actual busiest queue to migrate another
8445 * task, the next balance pass can still reduce the busiest
8446 * queue by moving tasks around inside the node.
8447 *
8448 * If we cannot move enough load due to this classification
8449 * the next pass will adjust the group classification and
8450 * allow migration of more tasks.
8451 *
8452 * Both cases only affect the total convergence complexity.
8453 */
8454 if (rt > env->fbq_type)
8455 continue;
8456
Morten Rasmussencad68e52018-07-04 11:17:42 +01008457 /*
8458 * For ASYM_CPUCAPACITY domains with misfit tasks we simply
8459 * seek the "biggest" misfit task.
8460 */
8461 if (env->src_grp_type == group_misfit_task) {
8462 if (rq->misfit_task_load > busiest_load) {
8463 busiest_load = rq->misfit_task_load;
8464 busiest = rq;
8465 }
8466
8467 continue;
8468 }
8469
Nicolas Pitreced549f2014-05-26 18:19:38 -04008470 capacity = capacity_of(i);
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008471
Chris Redpath4ad38312018-07-04 11:17:48 +01008472 /*
8473 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
8474 * eventually lead to active_balancing high->low capacity.
8475 * Higher per-CPU capacity is considered better than balancing
8476 * average load.
8477 */
8478 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
8479 capacity_of(env->dst_cpu) < capacity &&
8480 rq->nr_running == 1)
8481 continue;
8482
Viresh Kumarc7132dd2017-05-24 10:59:54 +05308483 wl = weighted_cpuload(rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008484
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008485 /*
8486 * When comparing with imbalance, use weighted_cpuload()
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008487 * which is not scaled with the CPU capacity.
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008488 */
Vincent Guittotea678212015-02-27 16:54:11 +01008489
8490 if (rq->nr_running == 1 && wl > env->imbalance &&
8491 !check_cpu_capacity(rq, env->sd))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008492 continue;
8493
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008494 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008495		 * For the load comparisons with the other CPUs, consider
8496 * the weighted_cpuload() scaled with the CPU capacity, so
8497 * that the load can be moved away from the CPU that is
Nicolas Pitreced549f2014-05-26 18:19:38 -04008498 * potentially running at a lower capacity.
Joonsoo Kim95a79b82013-08-06 17:36:41 +09008499 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04008500 * Thus we're looking for max(wl_i / capacity_i), crosswise
Joonsoo Kim95a79b82013-08-06 17:36:41 +09008501 * multiplication to rid ourselves of the division works out
Nicolas Pitreced549f2014-05-26 18:19:38 -04008502 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
8503 * our previous maximum.
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01008504 */
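		/*
		 * Worked example: wl = 600 on a CPU with capacity 512 is busier
		 * than a previous pick of wl = 500 on a CPU with capacity 1024,
		 * because 600 * 1024 > 500 * 512.
		 */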
Nicolas Pitreced549f2014-05-26 18:19:38 -04008505 if (wl * busiest_capacity > busiest_load * capacity) {
Joonsoo Kim95a79b82013-08-06 17:36:41 +09008506 busiest_load = wl;
Nicolas Pitreced549f2014-05-26 18:19:38 -04008507 busiest_capacity = capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008508 busiest = rq;
8509 }
8510 }
8511
8512 return busiest;
8513}
8514
8515/*
8516 * Max backoff if we encounter pinned tasks. The value is fairly
8517 * arbitrary; it just needs to be large enough.
8518 */
8519#define MAX_PINNED_INTERVAL 512
8520
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008521static int need_active_balance(struct lb_env *env)
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01008522{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008523 struct sched_domain *sd = env->sd;
8524
8525 if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10008526
8527 /*
8528		 * ASYM_PACKING needs to force-migrate tasks from busy but
Tim Chenafe06ef2016-11-22 12:23:53 -08008529		 * lower priority CPUs in order to pack all tasks on the
8530		 * highest priority CPUs.
Michael Neuling532cb4c2010-06-08 14:57:02 +10008531 */
Tim Chenafe06ef2016-11-22 12:23:53 -08008532 if ((sd->flags & SD_ASYM_PACKING) &&
8533 sched_asym_prefer(env->dst_cpu, env->src_cpu))
Michael Neuling532cb4c2010-06-08 14:57:02 +10008534 return 1;
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01008535 }
8536
Vincent Guittot1aaf90a2015-02-27 16:54:14 +01008537 /*
8538 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
8539 * It's worth migrating the task if the src_cpu's capacity is reduced
8540 * because of other sched_class or IRQs if more capacity stays
8541 * available on dst_cpu.
8542 */
8543 if ((env->idle != CPU_NOT_IDLE) &&
8544 (env->src_rq->cfs.h_nr_running == 1)) {
8545 if ((check_cpu_capacity(env->src_rq, sd)) &&
8546 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
8547 return 1;
8548 }
8549
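	/* A misfit task in the source group always warrants an active balance. */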
Morten Rasmussencad68e52018-07-04 11:17:42 +01008550 if (env->src_grp_type == group_misfit_task)
8551 return 1;
8552
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01008553 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
8554}
8555
Tejun Heo969c7922010-05-06 18:49:21 +02008556static int active_load_balance_cpu_stop(void *data);
8557
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008558static int should_we_balance(struct lb_env *env)
8559{
8560 struct sched_group *sg = env->sd->groups;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008561 int cpu, balance_cpu = -1;
8562
8563 /*
Peter Zijlstra024c9d22017-10-09 10:36:53 +02008564	 * Ensure the balancing environment is consistent; inconsistency can
8565	 * happen when the softirq triggers 'during' hotplug.
8566 */
8567 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
8568 return 0;
8569
8570 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008571 * In the newly idle case, we will allow all the CPUs
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008572 * to do the newly idle load balance.
8573 */
8574 if (env->idle == CPU_NEWLY_IDLE)
8575 return 1;
8576
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008577 /* Try to find first idle CPU */
Peter Zijlstrae5c14b12017-05-01 10:47:02 +02008578 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
Peter Zijlstraaf218122017-05-01 08:51:05 +02008579 if (!idle_cpu(cpu))
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008580 continue;
8581
8582 balance_cpu = cpu;
8583 break;
8584 }
8585
8586 if (balance_cpu == -1)
8587 balance_cpu = group_balance_cpu(sg);
8588
8589 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008590 * First idle CPU or the first CPU(busiest) in this sched group
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008591 * is eligible for doing load balancing at this and above domains.
8592 */
Joonsoo Kimb0cff9d2013-09-10 15:54:49 +09008593 return balance_cpu == env->dst_cpu;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008594}
8595
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008596/*
8597 * Check this_cpu to ensure it is balanced within domain. Attempt to move
8598 * tasks if there is an imbalance.
8599 */
8600static int load_balance(int this_cpu, struct rq *this_rq,
8601 struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008602 int *continue_balancing)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008603{
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308604 int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra62633222013-08-19 12:41:09 +02008605 struct sched_domain *sd_parent = sd->parent;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008606 struct sched_group *group;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008607 struct rq *busiest;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008608 struct rq_flags rf;
Christoph Lameter4ba29682014-08-26 19:12:21 -05008609 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008610
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008611 struct lb_env env = {
8612 .sd = sd,
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008613 .dst_cpu = this_cpu,
8614 .dst_rq = this_rq,
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02008615 .dst_grpmask = sched_group_span(sd->groups),
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008616 .idle = idle,
Peter Zijlstraeb953082012-04-17 13:38:40 +02008617 .loop_break = sched_nr_migrate_break,
Michael Wangb94031302012-07-12 16:10:13 +08008618 .cpus = cpus,
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008619 .fbq_type = all,
Kirill Tkhai163122b2014-08-20 13:48:29 +04008620 .tasks = LIST_HEAD_INIT(env.tasks),
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008621 };
8622
Jeffrey Hugo65a44332017-06-07 13:18:57 -06008623 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008624
Josh Poimboeufae928822016-06-17 12:43:24 -05008625 schedstat_inc(sd->lb_count[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008626
8627redo:
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008628 if (!should_we_balance(&env)) {
8629 *continue_balancing = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008630 goto out_balanced;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008631 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008632
Joonsoo Kim23f0d202013-08-06 17:36:42 +09008633 group = find_busiest_group(&env);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008634 if (!group) {
Josh Poimboeufae928822016-06-17 12:43:24 -05008635 schedstat_inc(sd->lb_nobusyg[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008636 goto out_balanced;
8637 }
8638
Michael Wangb94031302012-07-12 16:10:13 +08008639 busiest = find_busiest_queue(&env, group);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008640 if (!busiest) {
Josh Poimboeufae928822016-06-17 12:43:24 -05008641 schedstat_inc(sd->lb_nobusyq[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008642 goto out_balanced;
8643 }
8644
Michael Wang78feefc2012-08-06 16:41:59 +08008645 BUG_ON(busiest == env.dst_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008646
Josh Poimboeufae928822016-06-17 12:43:24 -05008647 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008648
Vincent Guittot1aaf90a2015-02-27 16:54:14 +01008649 env.src_cpu = busiest->cpu;
8650 env.src_rq = busiest;
8651
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008652 ld_moved = 0;
8653 if (busiest->nr_running > 1) {
8654 /*
8655 * Attempt to move tasks. If find_busiest_group has found
8656 * an imbalance but busiest->nr_running <= 1, the group is
8657 * still unbalanced. ld_moved simply stays zero, so it is
8658 * correctly treated as an imbalance.
8659 */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008660 env.flags |= LBF_ALL_PINNED;
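		/*
		 * loop_max caps the total number of tasks detach_tasks() will
		 * iterate over for this balance attempt; sysctl_sched_nr_migrate
		 * defaults to 32.
		 */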
Peter Zijlstrac82513e2012-04-26 13:12:27 +02008661 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008662
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01008663more_balance:
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008664 rq_lock_irqsave(busiest, &rf);
Peter Zijlstra3bed5e22016-10-03 16:35:32 +02008665 update_rq_clock(busiest);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308666
8667 /*
8668 * cur_ld_moved - load moved in current iteration
8669 * ld_moved - cumulative load moved across iterations
8670 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04008671 cur_ld_moved = detach_tasks(&env);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008672
8673 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04008674 * We've detached some tasks from busiest_rq. Every
8675 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
8676 * unlock busiest->lock, and we are able to be sure
8677 * that nobody can manipulate the tasks in parallel.
8678 * See task_rq_lock() family for the details.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008679 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04008680
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008681 rq_unlock(busiest, &rf);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008682
8683 if (cur_ld_moved) {
8684 attach_tasks(&env);
8685 ld_moved += cur_ld_moved;
8686 }
8687
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008688 local_irq_restore(rf.flags);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308689
Joonsoo Kimf1cd0852013-04-23 17:27:37 +09008690 if (env.flags & LBF_NEED_BREAK) {
8691 env.flags &= ~LBF_NEED_BREAK;
8692 goto more_balance;
8693 }
8694
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308695 /*
8696 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8697 * us and move them to an alternate dst_cpu in our sched_group
8698 * where they can run. The upper limit on how many times we
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008699 * iterate on same src_cpu is dependent on number of CPUs in our
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308700 * sched_group.
8701 *
8702 * This changes load balance semantics a bit on who can move
8703 * load to a given_cpu. In addition to the given_cpu itself
8704	 * (or an ilb_cpu acting on its behalf where given_cpu is
8705 * nohz-idle), we now have balance_cpu in a position to move
8706 * load to given_cpu. In rare situations, this may cause
8707 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
8708	 * _independently_ and at the _same_ time to move some load to
8709	 * given_cpu) causing excess load to be moved to given_cpu.
8710	 * This however should not happen often in practice and
8711 * moreover subsequent load balance cycles should correct the
8712 * excess load moved.
8713 */
Peter Zijlstra62633222013-08-19 12:41:09 +02008714 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308715
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008716 /* Prevent to re-select dst_cpu via env's CPUs */
Vladimir Davydov7aff2e32013-09-15 21:30:13 +04008717 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8718
Michael Wang78feefc2012-08-06 16:41:59 +08008719 env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308720 env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra62633222013-08-19 12:41:09 +02008721 env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308722 env.loop = 0;
8723 env.loop_break = sched_nr_migrate_break;
Joonsoo Kime02e60c2013-04-23 17:27:42 +09008724
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05308725 /*
8726 * Go back to "more_balance" rather than "redo" since we
8727 * need to continue with same src_cpu.
8728 */
8729 goto more_balance;
8730 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008731
Peter Zijlstra62633222013-08-19 12:41:09 +02008732 /*
8733 * We failed to reach balance because of affinity.
8734 */
8735 if (sd_parent) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008736 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
Peter Zijlstra62633222013-08-19 12:41:09 +02008737
Vincent Guittotafdeee02014-08-26 13:06:44 +02008738 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
Peter Zijlstra62633222013-08-19 12:41:09 +02008739 *group_imbalance = 1;
Peter Zijlstra62633222013-08-19 12:41:09 +02008740 }
8741
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008742 /* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008743 if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008744 cpumask_clear_cpu(cpu_of(busiest), cpus);
Jeffrey Hugo65a44332017-06-07 13:18:57 -06008745 /*
8746 * Attempting to continue load balancing at the current
8747 * sched_domain level only makes sense if there are
8748 * active CPUs remaining as possible busiest CPUs to
8749 * pull load from which are not contained within the
8750 * destination group that is receiving any migrated
8751 * load.
8752 */
8753 if (!cpumask_subset(cpus, env.dst_grpmask)) {
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05308754 env.loop = 0;
8755 env.loop_break = sched_nr_migrate_break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008756 goto redo;
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05308757 }
Vincent Guittotafdeee02014-08-26 13:06:44 +02008758 goto out_all_pinned;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008759 }
8760 }
8761
8762 if (!ld_moved) {
Josh Poimboeufae928822016-06-17 12:43:24 -05008763 schedstat_inc(sd->lb_failed[idle]);
Venkatesh Pallipadi58b26c42010-09-10 18:19:17 -07008764 /*
8765 * Increment the failure counter only on periodic balance.
8766 * We do not want newidle balance, which can be very
8767 * frequent, pollute the failure counter causing
8768 * excessive cache_hot migrations and active balances.
8769 */
8770 if (idle != CPU_NEWLY_IDLE)
8771 sd->nr_balance_failed++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008772
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008773 if (need_active_balance(&env)) {
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008774 unsigned long flags;
8775
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008776 raw_spin_lock_irqsave(&busiest->lock, flags);
8777
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008778 /*
8779 * Don't kick the active_load_balance_cpu_stop,
8780 * if the curr task on busiest CPU can't be
8781 * moved to this_cpu:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008782 */
Ingo Molnar0c98d342017-02-05 15:38:10 +01008783 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008784 raw_spin_unlock_irqrestore(&busiest->lock,
8785 flags);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008786 env.flags |= LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008787 goto out_one_pinned;
8788 }
8789
Tejun Heo969c7922010-05-06 18:49:21 +02008790 /*
8791 * ->active_balance synchronizes accesses to
8792 * ->active_balance_work. Once set, it's cleared
8793 * only after active load balance is finished.
8794 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008795 if (!busiest->active_balance) {
8796 busiest->active_balance = 1;
8797 busiest->push_cpu = this_cpu;
8798 active_balance = 1;
8799 }
8800 raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo969c7922010-05-06 18:49:21 +02008801
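			/*
			 * Defer the actual push to the CPU stopper, which runs
			 * active_load_balance_cpu_stop() on the busiest CPU and
			 * moves one task from it to this_cpu (busiest->push_cpu).
			 */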
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008802 if (active_balance) {
Tejun Heo969c7922010-05-06 18:49:21 +02008803 stop_one_cpu_nowait(cpu_of(busiest),
8804 active_load_balance_cpu_stop, busiest,
8805 &busiest->active_balance_work);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008806 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008807
Srikar Dronamrajud02c071182016-03-23 17:54:44 +05308808 /* We've kicked active balancing, force task migration. */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008809 sd->nr_balance_failed = sd->cache_nice_tries+1;
8810 }
8811 } else
8812 sd->nr_balance_failed = 0;
8813
8814 if (likely(!active_balance)) {
8815 /* We were unbalanced, so reset the balancing interval */
8816 sd->balance_interval = sd->min_interval;
8817 } else {
8818 /*
8819 * If we've begun active balancing, start to back off. This
8820 * case may not be covered by the all_pinned logic if there
8821 * is only 1 task on the busy runqueue (because we don't call
Kirill Tkhai163122b2014-08-20 13:48:29 +04008822 * detach_tasks).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008823 */
8824 if (sd->balance_interval < sd->max_interval)
8825 sd->balance_interval *= 2;
8826 }
8827
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008828 goto out;
8829
8830out_balanced:
Vincent Guittotafdeee02014-08-26 13:06:44 +02008831 /*
8832 * We reach balance although we may have faced some affinity
8833 * constraints. Clear the imbalance flag if it was set.
8834 */
8835 if (sd_parent) {
8836 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
8837
8838 if (*group_imbalance)
8839 *group_imbalance = 0;
8840 }
8841
8842out_all_pinned:
8843 /*
8844 * We reach balance because all tasks are pinned at this level so
8845	 * we can't migrate them. Leave the imbalance flag set so the parent
8846	 * level can try to migrate them.
8847 */
Josh Poimboeufae928822016-06-17 12:43:24 -05008848 schedstat_inc(sd->lb_balanced[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008849
8850 sd->nr_balance_failed = 0;
8851
8852out_one_pinned:
8853 /* tune up the balancing interval */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008854 if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra5b54b562011-09-22 15:23:13 +02008855 sd->balance_interval < MAX_PINNED_INTERVAL) ||
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008856 (sd->balance_interval < sd->max_interval))
8857 sd->balance_interval *= 2;
8858
Venkatesh Pallipadi46e49b32011-02-14 14:38:50 -08008859 ld_moved = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008860out:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008861 return ld_moved;
8862}
8863
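/*
 * Example (assuming balance_interval = 8 ms and the default busy_factor of
 * 32): the domain of a busy CPU is rebalanced at most once every 256 ms,
 * after the ms-to-jiffies conversion and clamping below.
 */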
Jason Low52a08ef2014-05-08 17:49:22 -07008864static inline unsigned long
8865get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
8866{
8867 unsigned long interval = sd->balance_interval;
8868
8869 if (cpu_busy)
8870 interval *= sd->busy_factor;
8871
8872 /* scale ms to jiffies */
8873 interval = msecs_to_jiffies(interval);
8874 interval = clamp(interval, 1UL, max_load_balance_interval);
8875
8876 return interval;
8877}
8878
8879static inline void
Leo Yan31851a92016-08-05 14:31:29 +08008880update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
Jason Low52a08ef2014-05-08 17:49:22 -07008881{
8882 unsigned long interval, next;
8883
Leo Yan31851a92016-08-05 14:31:29 +08008884 /* used by idle balance, so cpu_busy = 0 */
8885 interval = get_sd_balance_interval(sd, 0);
Jason Low52a08ef2014-05-08 17:49:22 -07008886 next = sd->last_balance + interval;
8887
8888 if (time_after(*next_balance, next))
8889 *next_balance = next;
8890}
8891
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008892/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008893 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
Tejun Heo969c7922010-05-06 18:49:21 +02008894 * running tasks off the busiest CPU onto idle CPUs. It requires at
8895 * least 1 task to be running on each physical CPU where possible, and
8896 * avoids physical / logical imbalances.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008897 */
Tejun Heo969c7922010-05-06 18:49:21 +02008898static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008899{
Tejun Heo969c7922010-05-06 18:49:21 +02008900 struct rq *busiest_rq = data;
8901 int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008902 int target_cpu = busiest_rq->push_cpu;
Tejun Heo969c7922010-05-06 18:49:21 +02008903 struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008904 struct sched_domain *sd;
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008905 struct task_struct *p = NULL;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008906 struct rq_flags rf;
Tejun Heo969c7922010-05-06 18:49:21 +02008907
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008908 rq_lock_irq(busiest_rq, &rf);
Peter Zijlstraedd8e412017-09-07 17:03:51 +02008909 /*
8910 * Between queueing the stop-work and running it is a hole in which
8911 * CPUs can become inactive. We should not move tasks from or to
8912 * inactive CPUs.
8913 */
8914 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8915 goto out_unlock;
Tejun Heo969c7922010-05-06 18:49:21 +02008916
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008917 /* Make sure the requested CPU hasn't gone down in the meantime: */
Tejun Heo969c7922010-05-06 18:49:21 +02008918 if (unlikely(busiest_cpu != smp_processor_id() ||
8919 !busiest_rq->active_balance))
8920 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008921
8922 /* Is there any task to move? */
8923 if (busiest_rq->nr_running <= 1)
Tejun Heo969c7922010-05-06 18:49:21 +02008924 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008925
8926 /*
8927 * This condition is "impossible", if it occurs
8928 * we need to fix it. Originally reported by
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008929 * Bjorn Helgaas on a 128-CPU setup.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008930 */
8931 BUG_ON(busiest_rq == target_rq);
8932
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008933 /* Search for an sd spanning us and the target CPU. */
Peter Zijlstradce840a2011-04-07 14:09:50 +02008934 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008935 for_each_domain(target_cpu, sd) {
8936 if ((sd->flags & SD_LOAD_BALANCE) &&
8937 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
8938 break;
8939 }
8940
8941 if (likely(sd)) {
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008942 struct lb_env env = {
8943 .sd = sd,
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01008944 .dst_cpu = target_cpu,
8945 .dst_rq = target_rq,
8946 .src_cpu = busiest_rq->cpu,
8947 .src_rq = busiest_rq,
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008948 .idle = CPU_IDLE,
Jeffrey Hugo65a44332017-06-07 13:18:57 -06008949 /*
8950 * can_migrate_task() doesn't need to compute new_dst_cpu
8951 * for active balancing. Since we have CPU_IDLE, but no
8952 * @dst_grpmask we need to make that test go away with lying
8953 * about DST_PINNED.
8954 */
8955 .flags = LBF_DST_PINNED,
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01008956 };
8957
Josh Poimboeufae928822016-06-17 12:43:24 -05008958 schedstat_inc(sd->alb_count);
Peter Zijlstra3bed5e22016-10-03 16:35:32 +02008959 update_rq_clock(busiest_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008960
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008961 p = detach_one_task(&env);
Srikar Dronamrajud02c071182016-03-23 17:54:44 +05308962 if (p) {
Josh Poimboeufae928822016-06-17 12:43:24 -05008963 schedstat_inc(sd->alb_pushed);
Srikar Dronamrajud02c071182016-03-23 17:54:44 +05308964 /* Active balancing done, reset the failure counter. */
8965 sd->nr_balance_failed = 0;
8966 } else {
Josh Poimboeufae928822016-06-17 12:43:24 -05008967 schedstat_inc(sd->alb_failed);
Srikar Dronamrajud02c071182016-03-23 17:54:44 +05308968 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008969 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02008970 rcu_read_unlock();
Tejun Heo969c7922010-05-06 18:49:21 +02008971out_unlock:
8972 busiest_rq->active_balance = 0;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008973 rq_unlock(busiest_rq, &rf);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04008974
8975 if (p)
8976 attach_one_task(target_rq, p);
8977
8978 local_irq_enable();
8979
Tejun Heo969c7922010-05-06 18:49:21 +02008980 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008981}
8982
Peter Zijlstraaf3fe032018-02-20 10:58:39 +01008983static DEFINE_SPINLOCK(balancing);
8984
8985/*
8986 * Scale the max load_balance interval with the number of CPUs in the system.
8987 * This trades load-balance latency on larger machines for less cross talk.
8988 */
8989void update_max_interval(void)
8990{
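	/* For example, with HZ = 250 and 8 CPUs online this is 200 jiffies (800 ms). */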
8991 max_load_balance_interval = HZ*num_online_cpus()/10;
8992}
8993
8994/*
8995 * It checks each scheduling domain to see if it is due to be balanced,
8996 * and initiates a balancing operation if so.
8997 *
8998 * Balancing parameters are set up in init_sched_domains.
8999 */
9000static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9001{
9002 int continue_balancing = 1;
9003 int cpu = rq->cpu;
9004 unsigned long interval;
9005 struct sched_domain *sd;
9006 /* Earliest time when we have to do rebalance again */
9007 unsigned long next_balance = jiffies + 60*HZ;
9008 int update_next_balance = 0;
9009 int need_serialize, need_decay = 0;
9010 u64 max_cost = 0;
9011
9012 rcu_read_lock();
9013 for_each_domain(cpu, sd) {
9014 /*
9015 * Decay the newidle max times here because this is a regular
9016 * visit to all the domains. Decay ~1% per second.
9017 */
9018 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9019 sd->max_newidle_lb_cost =
9020 (sd->max_newidle_lb_cost * 253) / 256;
9021 sd->next_decay_max_lb_cost = jiffies + HZ;
9022 need_decay = 1;
9023 }
9024 max_cost += sd->max_newidle_lb_cost;
9025
9026 if (!(sd->flags & SD_LOAD_BALANCE))
9027 continue;
9028
9029 /*
9030 * Stop the load balance at this level. There is another
9031 * CPU in our sched group which is doing load balancing more
9032 * actively.
9033 */
9034 if (!continue_balancing) {
9035 if (need_decay)
9036 continue;
9037 break;
9038 }
9039
9040 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9041
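		/*
		 * Domains with SD_SERIALIZE set (e.g. the NUMA levels) let only
		 * one CPU in the system balance them at a time, via the global
		 * 'balancing' spinlock.
		 */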
9042 need_serialize = sd->flags & SD_SERIALIZE;
9043 if (need_serialize) {
9044 if (!spin_trylock(&balancing))
9045 goto out;
9046 }
9047
9048 if (time_after_eq(jiffies, sd->last_balance + interval)) {
9049 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9050 /*
9051 * The LBF_DST_PINNED logic could have changed
9052 * env->dst_cpu, so we can't know our idle
9053 * state even if we migrated tasks. Update it.
9054 */
9055 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9056 }
9057 sd->last_balance = jiffies;
9058 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9059 }
9060 if (need_serialize)
9061 spin_unlock(&balancing);
9062out:
9063 if (time_after(next_balance, sd->last_balance + interval)) {
9064 next_balance = sd->last_balance + interval;
9065 update_next_balance = 1;
9066 }
9067 }
9068 if (need_decay) {
9069 /*
9070 * Ensure the rq-wide value also decays but keep it at a
9071 * reasonable floor to avoid funnies with rq->avg_idle.
9072 */
9073 rq->max_idle_balance_cost =
9074 max((u64)sysctl_sched_migration_cost, max_cost);
9075 }
9076 rcu_read_unlock();
9077
9078 /*
9079 * next_balance will be updated only when there is a need.
9080	 * When the CPU is attached to the null domain, for example, it will not be
9081 * updated.
9082 */
9083 if (likely(update_next_balance)) {
9084 rq->next_balance = next_balance;
9085
9086#ifdef CONFIG_NO_HZ_COMMON
9087 /*
9088 * If this CPU has been elected to perform the nohz idle
9089 * balance. Other idle CPUs have already rebalanced with
9090 * nohz_idle_balance() and nohz.next_balance has been
9091 * updated accordingly. This CPU is now running the idle load
9092 * balance for itself and we need to update the
9093 * nohz.next_balance accordingly.
9094 */
9095 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9096 nohz.next_balance = rq->next_balance;
9097#endif
9098 }
9099}
9100
Mike Galbraithd987fc72011-12-05 10:01:47 +01009101static inline int on_null_domain(struct rq *rq)
9102{
9103 return unlikely(!rcu_dereference_sched(rq->sd));
9104}
9105
Frederic Weisbecker3451d022011-08-10 23:21:01 +02009106#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009107/*
9108 * idle load balancing details
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009109 * - When one of the busy CPUs notices that there may be an idle rebalancing
9110 * needed, they will kick the idle load balancer, which then does idle
9111 * load balancing for all the idle CPUs.
9112 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009113
Daniel Lezcano3dd03372014-01-06 12:34:41 +01009114static inline int find_new_ilb(void)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009115{
Suresh Siddha0b005cf2011-12-01 17:07:34 -08009116 int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009117
Suresh Siddha786d6dc2011-12-01 17:07:35 -08009118 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9119 return ilb;
9120
9121 return nr_cpu_ids;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009122}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009123
9124/*
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009125 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9126 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
9127 * CPU (if there is one).
9128 */
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009129static void kick_ilb(unsigned int flags)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009130{
9131 int ilb_cpu;
9132
9133 nohz.next_balance++;
9134
Daniel Lezcano3dd03372014-01-06 12:34:41 +01009135 ilb_cpu = find_new_ilb();
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009136
Suresh Siddha0b005cf2011-12-01 17:07:34 -08009137 if (ilb_cpu >= nr_cpu_ids)
9138 return;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009139
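	/*
	 * atomic_fetch_or() returns the previous flags: if a kick was already
	 * pending for ilb_cpu, that earlier IPI will also service the bits we
	 * just added, so don't send another one.
	 */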
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009140 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009141 if (flags & NOHZ_KICK_MASK)
Suresh Siddha1c792db2011-12-01 17:07:32 -08009142 return;
Peter Zijlstra45504872017-12-21 10:47:48 +01009143
Suresh Siddha1c792db2011-12-01 17:07:32 -08009144 /*
9145 * Use smp_send_reschedule() instead of resched_cpu().
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009146 * This way we generate a sched IPI on the target CPU which
Suresh Siddha1c792db2011-12-01 17:07:32 -08009147 * is idle. And the softirq performing nohz idle load balance
9148 * will be run before returning from the IPI.
9149 */
9150 smp_send_reschedule(ilb_cpu);
Peter Zijlstra45504872017-12-21 10:47:48 +01009151}
9152
9153/*
9154 * Current heuristic for kicking the idle load balancer in the presence
9155 * of an idle cpu in the system.
9156 * - This rq has more than one task.
9157 * - This rq has at least one CFS task and the capacity of the CPU is
9158 * significantly reduced because of RT tasks or IRQs.
9159 *   - At the parent of the LLC scheduler domain level, this CPU's scheduler
9160 *     group has multiple busy CPUs.
9161 *   - For SD_ASYM_PACKING, if the lower-numbered CPUs in the scheduler
9162 *     domain span are idle.
9163 */
9164static void nohz_balancer_kick(struct rq *rq)
9165{
9166 unsigned long now = jiffies;
9167 struct sched_domain_shared *sds;
9168 struct sched_domain *sd;
9169 int nr_busy, i, cpu = rq->cpu;
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009170 unsigned int flags = 0;
Peter Zijlstra45504872017-12-21 10:47:48 +01009171
9172 if (unlikely(rq->idle_balance))
9173 return;
9174
9175 /*
9176	 * We may have recently been in ticked or tickless idle mode. At the first
9177 * busy tick after returning from idle, we will update the busy stats.
9178 */
Peter Zijlstra00357f52017-12-21 15:06:50 +01009179 nohz_balance_exit_idle(rq);
Peter Zijlstra45504872017-12-21 10:47:48 +01009180
9181 /*
9182 * None are in tickless mode and hence no need for NOHZ idle load
9183 * balancing.
9184 */
9185 if (likely(!atomic_read(&nohz.nr_cpus)))
9186 return;
9187
Vincent Guittotf643ea22018-02-13 11:31:17 +01009188 if (READ_ONCE(nohz.has_blocked) &&
9189 time_after(now, READ_ONCE(nohz.next_blocked)))
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009190 flags = NOHZ_STATS_KICK;
9191
Peter Zijlstra45504872017-12-21 10:47:48 +01009192 if (time_before(now, nohz.next_balance))
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009193 goto out;
Peter Zijlstra45504872017-12-21 10:47:48 +01009194
Valentin Schneider5fbdfae2018-07-04 11:17:43 +01009195 if (rq->nr_running >= 2 || rq->misfit_task_load) {
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009196 flags = NOHZ_KICK_MASK;
Peter Zijlstra45504872017-12-21 10:47:48 +01009197 goto out;
9198 }
9199
9200 rcu_read_lock();
9201 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
9202 if (sds) {
9203 /*
9204 * XXX: write a coherent comment on why we do this.
9205 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
9206 */
9207 nr_busy = atomic_read(&sds->nr_busy_cpus);
9208 if (nr_busy > 1) {
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009209 flags = NOHZ_KICK_MASK;
Peter Zijlstra45504872017-12-21 10:47:48 +01009210 goto unlock;
9211 }
9212
9213 }
9214
9215 sd = rcu_dereference(rq->sd);
9216 if (sd) {
9217 if ((rq->cfs.h_nr_running >= 1) &&
9218 check_cpu_capacity(rq, sd)) {
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009219 flags = NOHZ_KICK_MASK;
Peter Zijlstra45504872017-12-21 10:47:48 +01009220 goto unlock;
9221 }
9222 }
9223
9224 sd = rcu_dereference(per_cpu(sd_asym, cpu));
9225 if (sd) {
9226 for_each_cpu(i, sched_domain_span(sd)) {
9227 if (i == cpu ||
9228 !cpumask_test_cpu(i, nohz.idle_cpus_mask))
9229 continue;
9230
9231 if (sched_asym_prefer(i, cpu)) {
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009232 flags = NOHZ_KICK_MASK;
Peter Zijlstra45504872017-12-21 10:47:48 +01009233 goto unlock;
9234 }
9235 }
9236 }
9237unlock:
9238 rcu_read_unlock();
9239out:
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009240 if (flags)
9241 kick_ilb(flags);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009242}
9243
Peter Zijlstra00357f52017-12-21 15:06:50 +01009244static void set_cpu_sd_state_busy(int cpu)
Suresh Siddha69e1e812011-12-01 17:07:33 -08009245{
9246 struct sched_domain *sd;
Peter Zijlstra00357f52017-12-21 15:06:50 +01009247
9248 rcu_read_lock();
9249 sd = rcu_dereference(per_cpu(sd_llc, cpu));
9250
9251 if (!sd || !sd->nohz_idle)
9252 goto unlock;
9253 sd->nohz_idle = 0;
9254
9255 atomic_inc(&sd->shared->nr_busy_cpus);
9256unlock:
9257 rcu_read_unlock();
9258}
9259
9260void nohz_balance_exit_idle(struct rq *rq)
9261{
9262 SCHED_WARN_ON(rq != this_rq());
9263
9264 if (likely(!rq->nohz_tick_stopped))
9265 return;
9266
9267 rq->nohz_tick_stopped = 0;
9268 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
9269 atomic_dec(&nohz.nr_cpus);
9270
9271 set_cpu_sd_state_busy(rq->cpu);
9272}
9273
9274static void set_cpu_sd_state_idle(int cpu)
9275{
9276 struct sched_domain *sd;
Suresh Siddha69e1e812011-12-01 17:07:33 -08009277
Suresh Siddha69e1e812011-12-01 17:07:33 -08009278 rcu_read_lock();
Peter Zijlstra0e369d72016-05-09 10:38:01 +02009279 sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot25f55d92013-04-23 16:59:02 +02009280
9281 if (!sd || sd->nohz_idle)
9282 goto unlock;
9283 sd->nohz_idle = 1;
9284
Peter Zijlstra0e369d72016-05-09 10:38:01 +02009285 atomic_dec(&sd->shared->nr_busy_cpus);
Vincent Guittot25f55d92013-04-23 16:59:02 +02009286unlock:
Suresh Siddha69e1e812011-12-01 17:07:33 -08009287 rcu_read_unlock();
9288}
9289
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009290/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009291 * This routine will record that the CPU is going idle with tick stopped.
Suresh Siddha0b005cf2011-12-01 17:07:34 -08009292 * This info will be used in performing idle load balancing in the future.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009293 */
Alex Shic1cc0172012-09-10 15:10:58 +08009294void nohz_balance_enter_idle(int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009295{
Peter Zijlstra00357f52017-12-21 15:06:50 +01009296 struct rq *rq = cpu_rq(cpu);
9297
9298 SCHED_WARN_ON(cpu != smp_processor_id());
9299
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009300 /* If this CPU is going down, then nothing needs to be done: */
Suresh Siddha71325962012-01-19 18:28:57 -08009301 if (!cpu_active(cpu))
9302 return;
9303
Frederic Weisbecker387bc8b2017-06-19 04:12:02 +02009304 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
Frederic Weisbeckerde201552017-10-27 04:42:35 +02009305 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
Frederic Weisbecker387bc8b2017-06-19 04:12:02 +02009306 return;
9307
Vincent Guittotf643ea22018-02-13 11:31:17 +01009308 /*
9309	 * rq->has_blocked_load can be set safely without rq->lock held.
9310	 * If a clear happens, it will have evaluated the last additions, because
9311	 * rq->lock is held during the check and the clear.
9312 */
9313 rq->has_blocked_load = 1;
9314
9315 /*
9316 * The tick is still stopped but load could have been added in the
9317	 * meantime. We set the nohz.has_blocked flag to trigger a check of the
9318 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
9319 * of nohz.has_blocked can only happen after checking the new load
9320 */
Peter Zijlstra00357f52017-12-21 15:06:50 +01009321 if (rq->nohz_tick_stopped)
Vincent Guittotf643ea22018-02-13 11:31:17 +01009322 goto out;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009323
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009324 /* If we're a completely isolated CPU, we don't play: */
Peter Zijlstra00357f52017-12-21 15:06:50 +01009325 if (on_null_domain(rq))
Mike Galbraithd987fc72011-12-05 10:01:47 +01009326 return;
9327
Peter Zijlstra00357f52017-12-21 15:06:50 +01009328 rq->nohz_tick_stopped = 1;
9329
Alex Shic1cc0172012-09-10 15:10:58 +08009330 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9331 atomic_inc(&nohz.nr_cpus);
Peter Zijlstra00357f52017-12-21 15:06:50 +01009332
Vincent Guittotf643ea22018-02-13 11:31:17 +01009333 /*
9334 * Ensures that if nohz_idle_balance() fails to observe our
9335 * @idle_cpus_mask store, it must observe the @has_blocked
9336 * store.
9337 */
9338 smp_mb__after_atomic();
9339
Peter Zijlstra00357f52017-12-21 15:06:50 +01009340 set_cpu_sd_state_idle(cpu);
Vincent Guittotf643ea22018-02-13 11:31:17 +01009341
9342out:
9343 /*
9344	 * Each time a CPU enters idle, we assume that it has blocked load and
9345	 * enable the periodic update of the load of idle CPUs.
9346 */
9347 WRITE_ONCE(nohz.has_blocked, 1);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009348}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009349
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009350/*
Vincent Guittot31e77c92018-02-14 16:26:46 +01009351 * Internal function that runs load balance for all idle cpus. The load balance
9352 * can be a simple update of blocked load or a complete load balance with
9353 * tasks movement depending of flags.
9354 * The function returns false if the loop has stopped before running
9355 * through all idle CPUs.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009356 */
Vincent Guittot31e77c92018-02-14 16:26:46 +01009357static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9358 enum cpu_idle_type idle)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009359{
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009360 /* Earliest time when we have to do rebalance again */
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009361 unsigned long now = jiffies;
9362 unsigned long next_balance = now + 60*HZ;
Vincent Guittotf643ea22018-02-13 11:31:17 +01009363 bool has_blocked_load = false;
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009364 int update_next_balance = 0;
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009365 int this_cpu = this_rq->cpu;
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009366 int balance_cpu;
Vincent Guittot31e77c92018-02-14 16:26:46 +01009367 int ret = false;
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009368 struct rq *rq;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009369
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009370 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009371
Vincent Guittotf643ea22018-02-13 11:31:17 +01009372 /*
9373 * We assume there will be no idle load after this update and clear
9374	 * the has_blocked flag. If a CPU enters idle in the meantime, it will
9375	 * set the has_blocked flag and trigger another update of idle load.
9376	 * Because a CPU that becomes idle is added to idle_cpus_mask before
9377	 * setting the flag, we are sure not to clear the state and not to
9378	 * check the load of an idle CPU.
9379 */
9380 WRITE_ONCE(nohz.has_blocked, 0);
9381
9382 /*
9383 * Ensures that if we miss the CPU, we must see the has_blocked
9384 * store from nohz_balance_enter_idle().
9385 */
9386 smp_mb();
9387
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009388 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha8a6d42d2011-12-06 11:19:37 -08009389 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009390 continue;
9391
9392 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009393 * If this CPU gets work to do, stop the load balancing
9394		 * work being done for other CPUs. The next load
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009395 * balancing owner will pick it up.
9396 */
Vincent Guittotf643ea22018-02-13 11:31:17 +01009397 if (need_resched()) {
9398 has_blocked_load = true;
9399 goto abort;
9400 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009401
Vincent Guittot5ed4f1d2012-09-13 06:11:26 +02009402 rq = cpu_rq(balance_cpu);
9403
Peter Zijlstra63928382018-02-13 16:54:17 +01009404 has_blocked_load |= update_nohz_stats(rq, true);
Vincent Guittotf643ea22018-02-13 11:31:17 +01009405
Tim Chened61bbc2014-05-20 14:39:27 -07009406 /*
9407 * If time for next balance is due,
9408 * do the balance.
9409 */
9410 if (time_after_eq(jiffies, rq->next_balance)) {
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009411 struct rq_flags rf;
9412
Vincent Guittot31e77c92018-02-14 16:26:46 +01009413 rq_lock_irqsave(rq, &rf);
Tim Chened61bbc2014-05-20 14:39:27 -07009414 update_rq_clock(rq);
Frederic Weisbeckercee1afc2016-04-13 15:56:50 +02009415 cpu_load_update_idle(rq);
Vincent Guittot31e77c92018-02-14 16:26:46 +01009416 rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009417
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009418 if (flags & NOHZ_BALANCE_KICK)
9419 rebalance_domains(rq, CPU_IDLE);
Tim Chened61bbc2014-05-20 14:39:27 -07009420 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009421
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009422 if (time_after(next_balance, rq->next_balance)) {
9423 next_balance = rq->next_balance;
9424 update_next_balance = 1;
9425 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009426 }
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009427
Vincent Guittot31e77c92018-02-14 16:26:46 +01009428 /* Newly idle CPU doesn't need an update */
9429 if (idle != CPU_NEWLY_IDLE) {
9430 update_blocked_averages(this_cpu);
9431 has_blocked_load |= this_rq->has_blocked_load;
9432 }
9433
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009434 if (flags & NOHZ_BALANCE_KICK)
9435 rebalance_domains(this_rq, CPU_IDLE);
9436
Vincent Guittotf643ea22018-02-13 11:31:17 +01009437 WRITE_ONCE(nohz.next_blocked,
9438 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
9439
Vincent Guittot31e77c92018-02-14 16:26:46 +01009440 /* The full idle balance loop has been done */
9441 ret = true;
9442
Vincent Guittotf643ea22018-02-13 11:31:17 +01009443abort:
9444 /* There is still blocked load, enable periodic update */
9445 if (has_blocked_load)
9446 WRITE_ONCE(nohz.has_blocked, 1);
Peter Zijlstraa4064fb2017-12-21 10:42:50 +01009447
Vincent Guittotc5afb6a2015-08-03 11:55:50 +02009448 /*
9449 * next_balance will be updated only when there is a need.
9450	 * When the CPU is attached to the null domain, for example, it will not be
9451 * updated.
9452 */
9453 if (likely(update_next_balance))
9454 nohz.next_balance = next_balance;
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009455
Vincent Guittot31e77c92018-02-14 16:26:46 +01009456 return ret;
9457}
9458
9459/*
9460 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9461 * rebalancing for all the cpus for whom scheduler ticks are stopped.
9462 */
9463static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9464{
9465 int this_cpu = this_rq->cpu;
9466 unsigned int flags;
9467
9468 if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK))
9469 return false;
9470
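	/* Not actually idle any more: just clear the pending kick and bail. */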
9471 if (idle != CPU_IDLE) {
9472 atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9473 return false;
9474 }
9475
9476 /*
9477 * barrier, pairs with nohz_balance_enter_idle(), ensures ...
9478 */
9479 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
9480 if (!(flags & NOHZ_KICK_MASK))
9481 return false;
9482
9483 _nohz_idle_balance(this_rq, flags, idle);
9484
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009485 return true;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009486}
Vincent Guittot31e77c92018-02-14 16:26:46 +01009487
9488static void nohz_newidle_balance(struct rq *this_rq)
9489{
9490 int this_cpu = this_rq->cpu;
9491
9492 /*
9493 * This CPU doesn't want to be disturbed by scheduler
9494 * housekeeping
9495 */
9496 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
9497 return;
9498
9499	/* Will wake up very soon. No time for doing anything else. */
9500 if (this_rq->avg_idle < sysctl_sched_migration_cost)
9501 return;
9502
9503	/* Don't need to update blocked load of idle CPUs. */
9504 if (!READ_ONCE(nohz.has_blocked) ||
9505 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
9506 return;
9507
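	/*
	 * Drop this_rq->lock while updating the other idle CPUs;
	 * _nohz_idle_balance() takes their rq locks.
	 */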
9508 raw_spin_unlock(&this_rq->lock);
9509 /*
9510	 * This CPU is going to be idle and the blocked load of idle CPUs
9511	 * needs to be updated. Run the ilb locally as it is a good
9512	 * candidate for ilb instead of waking up another idle CPU.
9513	 * Kick a normal ilb if we failed to do the update.
9514 */
9515 if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
9516 kick_ilb(NOHZ_STATS_KICK);
9517 raw_spin_lock(&this_rq->lock);
9518}
9519
Peter Zijlstradd707242018-02-20 10:59:45 +01009520#else /* !CONFIG_NO_HZ_COMMON */
9521static inline void nohz_balancer_kick(struct rq *rq) { }
9522
Vincent Guittot31e77c92018-02-14 16:26:46 +01009523static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009524{
9525 return false;
9526}
Vincent Guittot31e77c92018-02-14 16:26:46 +01009527
9528static inline void nohz_newidle_balance(struct rq *this_rq) { }
Peter Zijlstradd707242018-02-20 10:59:45 +01009529#endif /* CONFIG_NO_HZ_COMMON */
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009530
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009531/*
Peter Zijlstra47ea5412018-02-20 11:45:47 +01009532 * idle_balance is called by schedule() if this_cpu is about to become
9533 * idle. Attempts to pull tasks from other CPUs.
9534 */
9535static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
9536{
9537 unsigned long next_balance = jiffies + HZ;
9538 int this_cpu = this_rq->cpu;
9539 struct sched_domain *sd;
9540 int pulled_task = 0;
9541 u64 curr_cost = 0;
9542
9543 /*
9544 * We must set idle_stamp _before_ calling idle_balance(), such that we
9545 * measure the duration of idle_balance() as idle time.
9546 */
9547 this_rq->idle_stamp = rq_clock(this_rq);
9548
9549 /*
9550 * Do not pull tasks towards !active CPUs...
9551 */
9552 if (!cpu_active(this_cpu))
9553 return 0;
9554
9555 /*
9556 * This is OK, because current is on_cpu, which avoids it being picked
9557 * for load-balance and preemption/IRQs are still disabled avoiding
9558 * further scheduler activity on it and we're being very careful to
9559 * re-start the picking loop.
9560 */
9561 rq_unpin_lock(this_rq, rf);
9562
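	/*
	 * Bail out early when this CPU is expected to be idle only briefly
	 * (avg_idle below the migration cost) or when no CPU in the root
	 * domain is overloaded; just refresh next_balance and, if needed,
	 * the blocked load of the nohz idle CPUs.
	 */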
9563 if (this_rq->avg_idle < sysctl_sched_migration_cost ||
Valentin Schneidere90c8fe2018-07-04 11:17:46 +01009564 !READ_ONCE(this_rq->rd->overload)) {
Vincent Guittot31e77c92018-02-14 16:26:46 +01009565
Peter Zijlstra47ea5412018-02-20 11:45:47 +01009566 rcu_read_lock();
9567 sd = rcu_dereference_check_sched_domain(this_rq->sd);
9568 if (sd)
9569 update_next_balance(sd, &next_balance);
9570 rcu_read_unlock();
9571
Vincent Guittot31e77c92018-02-14 16:26:46 +01009572 nohz_newidle_balance(this_rq);
9573
Peter Zijlstra47ea5412018-02-20 11:45:47 +01009574 goto out;
9575 }
9576
9577 raw_spin_unlock(&this_rq->lock);
9578
9579 update_blocked_averages(this_cpu);
9580 rcu_read_lock();
9581 for_each_domain(this_cpu, sd) {
9582 int continue_balancing = 1;
9583 u64 t0, domain_cost;
9584
9585 if (!(sd->flags & SD_LOAD_BALANCE))
9586 continue;
9587
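		/*
		 * Stop once the cost already spent plus this domain's worst-case
		 * newidle balance cost would exceed the expected idle time.
		 */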
9588 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9589 update_next_balance(sd, &next_balance);
9590 break;
9591 }
9592
9593 if (sd->flags & SD_BALANCE_NEWIDLE) {
9594 t0 = sched_clock_cpu(this_cpu);
9595
9596 pulled_task = load_balance(this_cpu, this_rq,
9597 sd, CPU_NEWLY_IDLE,
9598 &continue_balancing);
9599
9600 domain_cost = sched_clock_cpu(this_cpu) - t0;
9601 if (domain_cost > sd->max_newidle_lb_cost)
9602 sd->max_newidle_lb_cost = domain_cost;
9603
9604 curr_cost += domain_cost;
9605 }
9606
9607 update_next_balance(sd, &next_balance);
9608
9609 /*
9610 * Stop searching for tasks to pull if there are
9611 * now runnable tasks on this rq.
9612 */
9613 if (pulled_task || this_rq->nr_running > 0)
9614 break;
9615 }
9616 rcu_read_unlock();
9617
9618 raw_spin_lock(&this_rq->lock);
9619
9620 if (curr_cost > this_rq->max_idle_balance_cost)
9621 this_rq->max_idle_balance_cost = curr_cost;
9622
Vincent Guittot457be902018-04-26 12:19:32 +02009623out:
Peter Zijlstra47ea5412018-02-20 11:45:47 +01009624 /*
9625	 * While browsing the domains, we released the rq lock; a task could
9626 * have been enqueued in the meantime. Since we're not going idle,
9627 * pretend we pulled a task.
9628 */
9629 if (this_rq->cfs.h_nr_running && !pulled_task)
9630 pulled_task = 1;
9631
Peter Zijlstra47ea5412018-02-20 11:45:47 +01009632 /* Move the next balance forward */
9633 if (time_after(this_rq->next_balance, next_balance))
9634 this_rq->next_balance = next_balance;
9635
9636	/* Is there a task of a higher-priority class? */
9637 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9638 pulled_task = -1;
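	/*
	 * The negative value is a hint for the caller (pick_next_task_fair()):
	 * a result < 0 is expected to make it restart the class walk, since an
	 * RT or DL task may have appeared while the rq lock was dropped.
	 */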
9639
9640 if (pulled_task)
9641 this_rq->idle_stamp = 0;
9642
9643 rq_repin_lock(this_rq, rf);
9644
9645 return pulled_task;
9646}
9647
9648/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009649 * run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07009650 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009651 */
Emese Revfy0766f782016-06-20 20:42:34 +02009652static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009653{
Daniel Lezcano208cb162014-01-06 12:34:44 +01009654 struct rq *this_rq = this_rq();
Suresh Siddha6eb57e02011-10-03 15:09:01 -07009655 enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009656 CPU_IDLE : CPU_NOT_IDLE;
9657
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009658 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009659 * If this CPU has a pending nohz_balance_kick, then do the
9660 * balancing on behalf of the other idle CPUs whose ticks are
Preeti U Murthyd4573c32015-03-26 18:32:44 +05309661 * stopped. Do nohz_idle_balance *before* rebalance_domains to
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009662 * give the idle CPUs a chance to load balance. Else we may
Preeti U Murthyd4573c32015-03-26 18:32:44 +05309663 * load balance only within the local sched_domain hierarchy
9664 * and abort nohz_idle_balance altogether if we pull some load.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009665 */
Peter Zijlstrab7031a02017-12-21 10:11:09 +01009666 if (nohz_idle_balance(this_rq, idle))
9667 return;
9668
9669 /* normal load balance */
9670 update_blocked_averages(this_rq->cpu);
Preeti U Murthyd4573c32015-03-26 18:32:44 +05309671 rebalance_domains(this_rq, idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009672}
9673
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009674/*
9675 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009676 */
Daniel Lezcano7caff662014-01-06 12:34:38 +01009677void trigger_load_balance(struct rq *rq)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009678{
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009679 /* Don't need to rebalance while attached to NULL domain */
Daniel Lezcanoc7260992014-01-06 12:34:45 +01009680 if (unlikely(on_null_domain(rq)))
9681 return;
9682
9683 if (time_after_eq(jiffies, rq->next_balance))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009684 raise_softirq(SCHED_SOFTIRQ);
Peter Zijlstra45504872017-12-21 10:47:48 +01009685
9686 nohz_balancer_kick(rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009687}
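/*
 * Rough picture of the periodic balancing flow (a sketch, not exhaustive):
 *
 *	scheduler_tick()
 *	  -> trigger_load_balance()
 *	       raise_softirq(SCHED_SOFTIRQ) once rq->next_balance is due
 *	       nohz_balancer_kick() to poke an idle CPU if needed
 *	SCHED_SOFTIRQ
 *	  -> run_rebalance_domains()
 *	       nohz_idle_balance() on behalf of tickless idle CPUs, or
 *	       rebalance_domains() for the local sched_domain hierarchy
 */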
9688
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01009689static void rq_online_fair(struct rq *rq)
9690{
9691 update_sysctl();
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04009692
9693 update_runtime_enabled(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01009694}
9695
9696static void rq_offline_fair(struct rq *rq)
9697{
9698 update_sysctl();
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -07009699
9700 /* Ensure any throttled groups are reachable by pick_next_task */
9701 unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01009702}
9703
Dhaval Giani55e12e52008-06-24 23:39:43 +05309704#endif /* CONFIG_SMP */
Peter Williamse1d14842007-10-24 18:23:51 +02009705
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009706/*
Frederic Weisbeckerd84b3132018-02-21 05:17:27 +01009707 * scheduler tick hitting a task of our scheduling class.
9708 *
9709 * NOTE: This function can be called remotely by the tick offload that
9710 * goes along with full dynticks. Therefore no assumptions about the
9711 * local CPU can be made, and everything must be accessed through the
9712 * @rq and @curr parameters passed in.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009713 */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01009714static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009715{
9716 struct cfs_rq *cfs_rq;
9717 struct sched_entity *se = &curr->se;
9718
9719 for_each_sched_entity(se) {
9720 cfs_rq = cfs_rq_of(se);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01009721 entity_tick(cfs_rq, se, queued);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009722 }
Ben Segall18bf2802012-10-04 12:51:20 +02009723
Srikar Dronamrajub52da862015-10-02 07:48:25 +05309724 if (static_branch_unlikely(&sched_numa_balancing))
Peter Zijlstracbee9f82012-10-25 14:16:43 +02009725 task_tick_numa(rq, curr);
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01009726
9727 update_misfit_status(curr, rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009728}
9729
9730/*
Peter Zijlstracd29fe62009-11-27 17:32:46 +01009731 * Called on fork with the child task as argument, from the parent's context:
9732 * - child not yet on the tasklist
9733 * - preemption disabled
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009734 */
Peter Zijlstracd29fe62009-11-27 17:32:46 +01009735static void task_fork_fair(struct task_struct *p)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009736{
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +09009737 struct cfs_rq *cfs_rq;
9738 struct sched_entity *se = &p->se, *curr;
Peter Zijlstracd29fe62009-11-27 17:32:46 +01009739 struct rq *rq = this_rq();
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009740 struct rq_flags rf;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009741
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009742 rq_lock(rq, &rf);
Peter Zijlstra861d0342010-08-19 13:31:43 +02009743 update_rq_clock(rq);
9744
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +09009745 cfs_rq = task_cfs_rq(current);
9746 curr = cfs_rq->curr;
Peter Zijlstrae210bff2016-06-16 18:51:48 +02009747 if (curr) {
9748 update_curr(cfs_rq);
Mike Galbraithb5d9d732009-09-08 11:12:28 +02009749 se->vruntime = curr->vruntime;
Peter Zijlstrae210bff2016-06-16 18:51:48 +02009750 }
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02009751 place_entity(cfs_rq, se, 1);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +02009752
Peter Zijlstracd29fe62009-11-27 17:32:46 +01009753 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko87fefa32007-10-15 17:00:08 +02009754 /*
Ingo Molnaredcb60a2007-10-15 17:00:08 +02009755 * Upon rescheduling, sched_class::put_prev_task() will place
9756 * 'current' within the tree based on its new key value.
9757 */
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +02009758 swap(curr->vruntime, se->vruntime);
Kirill Tkhai88751252014-06-29 00:03:57 +04009759 resched_curr(rq);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +02009760 }
9761
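	/*
	 * The child's vruntime is made relative here (min_vruntime is
	 * subtracted); when the child is enqueued from wake_up_new_task(),
	 * possibly on a different CPU, enqueue_entity() is expected to add
	 * the target cfs_rq's min_vruntime back, so the value stays
	 * comparable on whatever runqueue the child ends up on.
	 */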
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01009762 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009763 rq_unlock(rq, &rf);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02009764}
9765
Steven Rostedtcb469842008-01-25 21:08:22 +01009766/*
9767 * Priority of the task has changed. Check to see if we preempt
9768 * the current task.
9769 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +01009770static void
9771prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
Steven Rostedtcb469842008-01-25 21:08:22 +01009772{
Kirill Tkhaida0c1e62014-08-20 13:47:32 +04009773 if (!task_on_rq_queued(p))
Peter Zijlstrada7a7352011-01-17 17:03:27 +01009774 return;
9775
Steven Rostedtcb469842008-01-25 21:08:22 +01009776 /*
9777	 * Reschedule if we are currently running on this runqueue and
9778	 * our priority decreased, or if we are not currently running on
9779	 * this runqueue and our priority is higher than the current task's.
9780 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +01009781 if (rq->curr == p) {
Steven Rostedtcb469842008-01-25 21:08:22 +01009782 if (p->prio > oldprio)
Kirill Tkhai88751252014-06-29 00:03:57 +04009783 resched_curr(rq);
Steven Rostedtcb469842008-01-25 21:08:22 +01009784 } else
Peter Zijlstra15afe092008-09-20 23:38:02 +02009785 check_preempt_curr(rq, p, 0);
Steven Rostedtcb469842008-01-25 21:08:22 +01009786}
9787
Byungchul Parkdaa59402015-08-20 20:22:00 +09009788static inline bool vruntime_normalized(struct task_struct *p)
9789{
9790 struct sched_entity *se = &p->se;
9791
9792 /*
9793 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
9794 * the dequeue_entity(.flags=0) will already have normalized the
9795 * vruntime.
9796 */
9797 if (p->on_rq)
9798 return true;
9799
9800 /*
9801	 * When !on_rq, the task's vruntime has usually NOT been normalized.
9802	 * But there are some cases where it has already been normalized:
9803	 *
9804	 * - A forked child that is waiting to be woken up by
9805	 *   wake_up_new_task().
9806	 * - A task that has been woken up by try_to_wake_up() and is
9807	 *   waiting to actually be woken up by sched_ttwu_pending().
9808 */
Steve Muckled0cdb3c2018-08-31 15:42:17 -07009809 if (!se->sum_exec_runtime ||
9810 (p->state == TASK_WAKING && p->sched_remote_wakeup))
Byungchul Parkdaa59402015-08-20 20:22:00 +09009811 return true;
9812
9813 return false;
9814}
9815
Vincent Guittot09a43ac2016-11-08 10:53:45 +01009816#ifdef CONFIG_FAIR_GROUP_SCHED
9817/*
9818 * Propagate the changes to the sched_entity across the tg tree to make
9819 * them visible to the root.
9820 */
9821static void propagate_entity_cfs_rq(struct sched_entity *se)
9822{
9823 struct cfs_rq *cfs_rq;
9824
9825 /* Start to propagate at parent */
9826 se = se->parent;
9827
9828 for_each_sched_entity(se) {
9829 cfs_rq = cfs_rq_of(se);
9830
9831 if (cfs_rq_throttled(cfs_rq))
9832 break;
9833
Peter Zijlstra88c06162017-05-06 17:32:43 +02009834 update_load_avg(cfs_rq, se, UPDATE_TG);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01009835 }
9836}
9837#else
9838static void propagate_entity_cfs_rq(struct sched_entity *se) { }
9839#endif
9840
Vincent Guittotdf217912016-11-08 10:53:42 +01009841static void detach_entity_cfs_rq(struct sched_entity *se)
Peter Zijlstrada7a7352011-01-17 17:03:27 +01009842{
Peter Zijlstrada7a7352011-01-17 17:03:27 +01009843 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9844
Yuyang Du9d89c252015-07-15 08:04:37 +08009845 /* Catch up with the cfs_rq and remove our load when we leave */
Peter Zijlstra88c06162017-05-06 17:32:43 +02009846 update_load_avg(cfs_rq, se, 0);
Byungchul Parka05e8c52015-08-20 20:21:56 +09009847 detach_entity_load_avg(cfs_rq, se);
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02009848 update_tg_load_avg(cfs_rq, false);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01009849 propagate_entity_cfs_rq(se);
Peter Zijlstrada7a7352011-01-17 17:03:27 +01009850}
9851
Vincent Guittotdf217912016-11-08 10:53:42 +01009852static void attach_entity_cfs_rq(struct sched_entity *se)
Steven Rostedtcb469842008-01-25 21:08:22 +01009853{
Byungchul Parkdaa59402015-08-20 20:22:00 +09009854 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Byungchul Park7855a352015-08-10 18:02:55 +09009855
9856#ifdef CONFIG_FAIR_GROUP_SCHED
Michael wangeb7a59b2014-02-20 11:14:53 +08009857 /*
9858	 * Since the real depth could have changed (only the fair
9859	 * class maintains the depth value), reset it properly.
9860 */
9861 se->depth = se->parent ? se->parent->depth + 1 : 0;
9862#endif
Byungchul Park7855a352015-08-10 18:02:55 +09009863
Vincent Guittotdf217912016-11-08 10:53:42 +01009864 /* Synchronize entity with its cfs_rq */
Peter Zijlstra88c06162017-05-06 17:32:43 +02009865 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01009866 attach_entity_load_avg(cfs_rq, se, 0);
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02009867 update_tg_load_avg(cfs_rq, false);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01009868 propagate_entity_cfs_rq(se);
Vincent Guittotdf217912016-11-08 10:53:42 +01009869}
9870
9871static void detach_task_cfs_rq(struct task_struct *p)
9872{
9873 struct sched_entity *se = &p->se;
9874 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9875
9876 if (!vruntime_normalized(p)) {
9877 /*
9878 * Fix up our vruntime so that the current sleep doesn't
9879		 * cause an 'unlimited' sleep bonus.
9880 */
9881 place_entity(cfs_rq, se, 0);
9882 se->vruntime -= cfs_rq->min_vruntime;
9883 }
9884
9885 detach_entity_cfs_rq(se);
9886}
9887
9888static void attach_task_cfs_rq(struct task_struct *p)
9889{
9890 struct sched_entity *se = &p->se;
9891 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9892
9893 attach_entity_cfs_rq(se);
Byungchul Park6efdb102015-08-20 20:21:59 +09009894
Byungchul Parkdaa59402015-08-20 20:22:00 +09009895 if (!vruntime_normalized(p))
9896 se->vruntime += cfs_rq->min_vruntime;
9897}
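/*
 * The detach/attach pair above is what keeps a task consistent when it
 * changes class or group (switched_from/to_fair(), task_move_group_fair()):
 * roughly, detach removes the entity's PELT contribution from the old
 * cfs_rq and makes a non-normalized vruntime relative again, while attach
 * re-adds the load contribution and re-bases vruntime on the new cfs_rq's
 * min_vruntime.
 */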
Byungchul Park7855a352015-08-10 18:02:55 +09009898
Byungchul Parkdaa59402015-08-20 20:22:00 +09009899static void switched_from_fair(struct rq *rq, struct task_struct *p)
9900{
9901 detach_task_cfs_rq(p);
9902}
9903
9904static void switched_to_fair(struct rq *rq, struct task_struct *p)
9905{
9906 attach_task_cfs_rq(p);
9907
9908 if (task_on_rq_queued(p)) {
Byungchul Park7855a352015-08-10 18:02:55 +09009909 /*
Byungchul Parkdaa59402015-08-20 20:22:00 +09009910 * We were most likely switched from sched_rt, so
9911		 * kick off a reschedule if we are running, otherwise just see
9912 * if we can still preempt the current task.
Byungchul Park7855a352015-08-10 18:02:55 +09009913 */
Byungchul Parkdaa59402015-08-20 20:22:00 +09009914 if (rq->curr == p)
9915 resched_curr(rq);
9916 else
9917 check_preempt_curr(rq, p, 0);
Byungchul Park7855a352015-08-10 18:02:55 +09009918 }
Steven Rostedtcb469842008-01-25 21:08:22 +01009919}
9920
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02009921/*
 * Account for a task changing its policy or group.
9922 *
9923 * This routine is mostly called to set the cfs_rq->curr field when a task
9924 * migrates between groups/classes.
9925 */
9926static void set_curr_task_fair(struct rq *rq)
9927{
9928 struct sched_entity *se = &rq->curr->se;
9929
Paul Turnerec12cb72011-07-21 09:43:30 -07009930 for_each_sched_entity(se) {
9931 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9932
9933 set_next_entity(cfs_rq, se);
9934 /* ensure bandwidth has been allocated on our new cfs_rq */
9935 account_cfs_rq_runtime(cfs_rq, 0);
9936 }
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02009937}
9938
Peter Zijlstra029632f2011-10-25 10:00:11 +02009939void init_cfs_rq(struct cfs_rq *cfs_rq)
9940{
Davidlohr Buesobfb06882017-09-08 16:14:55 -07009941 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
Peter Zijlstra029632f2011-10-25 10:00:11 +02009942 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9943#ifndef CONFIG_64BIT
9944 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
9945#endif
Alex Shi141965c2013-06-26 13:05:39 +08009946#ifdef CONFIG_SMP
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02009947 raw_spin_lock_init(&cfs_rq->removed.lock);
Paul Turner9ee474f2012-10-04 13:18:30 +02009948#endif
Peter Zijlstra029632f2011-10-25 10:00:11 +02009949}
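/*
 * Note on the initial min_vruntime: (u64)(-(1LL << 20)) starts the clock
 * just below the u64 wrap point, presumably so that the wrap-safe signed
 * comparisons in the vruntime code get exercised soon after boot rather
 * than only after a very long uptime.
 */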
9950
Peter Zijlstra810b3812008-02-29 15:21:01 -05009951#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittotea86cb42016-06-17 13:38:55 +02009952static void task_set_group_fair(struct task_struct *p)
9953{
9954 struct sched_entity *se = &p->se;
9955
9956 set_task_rq(p, task_cpu(p));
9957 se->depth = se->parent ? se->parent->depth + 1 : 0;
9958}
9959
Peter Zijlstrabc54da22015-08-31 17:13:55 +02009960static void task_move_group_fair(struct task_struct *p)
Peter Zijlstra810b3812008-02-29 15:21:01 -05009961{
Byungchul Parkdaa59402015-08-20 20:22:00 +09009962 detach_task_cfs_rq(p);
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +02009963 set_task_rq(p, task_cpu(p));
Byungchul Park6efdb102015-08-20 20:21:59 +09009964
9965#ifdef CONFIG_SMP
9966 /* Tell se's cfs_rq has been changed -- migrated */
9967 p->se.avg.last_update_time = 0;
9968#endif
Byungchul Parkdaa59402015-08-20 20:22:00 +09009969 attach_task_cfs_rq(p);
Peter Zijlstra810b3812008-02-29 15:21:01 -05009970}
Peter Zijlstra029632f2011-10-25 10:00:11 +02009971
Vincent Guittotea86cb42016-06-17 13:38:55 +02009972static void task_change_group_fair(struct task_struct *p, int type)
9973{
9974 switch (type) {
9975 case TASK_SET_GROUP:
9976 task_set_group_fair(p);
9977 break;
9978
9979 case TASK_MOVE_GROUP:
9980 task_move_group_fair(p);
9981 break;
9982 }
9983}
9984
Peter Zijlstra029632f2011-10-25 10:00:11 +02009985void free_fair_sched_group(struct task_group *tg)
9986{
9987 int i;
9988
9989 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
9990
9991 for_each_possible_cpu(i) {
9992 if (tg->cfs_rq)
9993 kfree(tg->cfs_rq[i]);
Peter Zijlstra6fe1f342016-01-21 22:24:16 +01009994 if (tg->se)
Peter Zijlstra029632f2011-10-25 10:00:11 +02009995 kfree(tg->se[i]);
9996 }
9997
9998 kfree(tg->cfs_rq);
9999 kfree(tg->se);
10000}
10001
10002int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10003{
Peter Zijlstra029632f2011-10-25 10:00:11 +020010004 struct sched_entity *se;
Peter Zijlstrab7fa30c2016-06-09 15:07:50 +020010005 struct cfs_rq *cfs_rq;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010006 int i;
10007
Kees Cook6396bb22018-06-12 14:03:40 -070010008 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010009 if (!tg->cfs_rq)
10010 goto err;
Kees Cook6396bb22018-06-12 14:03:40 -070010011 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010012 if (!tg->se)
10013 goto err;
10014
10015 tg->shares = NICE_0_LOAD;
10016
10017 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10018
10019 for_each_possible_cpu(i) {
10020 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10021 GFP_KERNEL, cpu_to_node(i));
10022 if (!cfs_rq)
10023 goto err;
10024
10025 se = kzalloc_node(sizeof(struct sched_entity),
10026 GFP_KERNEL, cpu_to_node(i));
10027 if (!se)
10028 goto err_free_rq;
10029
10030 init_cfs_rq(cfs_rq);
10031 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
Yuyang Du540247f2015-07-15 08:04:39 +080010032 init_entity_runnable_average(se);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010033 }
10034
10035 return 1;
10036
10037err_free_rq:
10038 kfree(cfs_rq);
10039err:
10040 return 0;
10041}
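/*
 * Allocation pattern (a sketch): tg->cfs_rq[] and tg->se[] are arrays of
 * per-CPU pointers; each possible CPU gets its own cfs_rq and group
 * sched_entity, allocated node-local with kzalloc_node().  If any of the
 * allocations fails, the function returns 0 and the caller is expected to
 * tear the partially built group down via free_fair_sched_group().
 */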
10042
Peter Zijlstra8663e242016-06-22 14:58:02 +020010043void online_fair_sched_group(struct task_group *tg)
10044{
10045 struct sched_entity *se;
10046 struct rq *rq;
10047 int i;
10048
10049 for_each_possible_cpu(i) {
10050 rq = cpu_rq(i);
10051 se = tg->se[i];
10052
10053 raw_spin_lock_irq(&rq->lock);
Peter Zijlstra4126bad2016-10-03 16:20:59 +020010054 update_rq_clock(rq);
Vincent Guittotd0326692016-11-08 10:53:47 +010010055 attach_entity_cfs_rq(se);
Peter Zijlstra55e16d32016-06-22 15:14:26 +020010056 sync_throttle(tg, i);
Peter Zijlstra8663e242016-06-22 14:58:02 +020010057 raw_spin_unlock_irq(&rq->lock);
10058 }
10059}
10060
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010061void unregister_fair_sched_group(struct task_group *tg)
Peter Zijlstra029632f2011-10-25 10:00:11 +020010062{
Peter Zijlstra029632f2011-10-25 10:00:11 +020010063 unsigned long flags;
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010064 struct rq *rq;
10065 int cpu;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010066
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010067 for_each_possible_cpu(cpu) {
10068 if (tg->se[cpu])
10069 remove_entity_load_avg(tg->se[cpu]);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010070
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010071 /*
10072		 * Only empty task groups can be destroyed, so we can speculatively
10073 * check on_list without danger of it being re-added.
10074 */
10075 if (!tg->cfs_rq[cpu]->on_list)
10076 continue;
10077
10078 rq = cpu_rq(cpu);
10079
10080 raw_spin_lock_irqsave(&rq->lock, flags);
10081 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10082 raw_spin_unlock_irqrestore(&rq->lock, flags);
10083 }
Peter Zijlstra029632f2011-10-25 10:00:11 +020010084}
10085
10086void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10087 struct sched_entity *se, int cpu,
10088 struct sched_entity *parent)
10089{
10090 struct rq *rq = cpu_rq(cpu);
10091
10092 cfs_rq->tg = tg;
10093 cfs_rq->rq = rq;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010094 init_cfs_rq_runtime(cfs_rq);
10095
10096 tg->cfs_rq[cpu] = cfs_rq;
10097 tg->se[cpu] = se;
10098
10099 /* se could be NULL for root_task_group */
10100 if (!se)
10101 return;
10102
Peter Zijlstrafed14d42012-02-11 06:05:00 +010010103 if (!parent) {
Peter Zijlstra029632f2011-10-25 10:00:11 +020010104 se->cfs_rq = &rq->cfs;
Peter Zijlstrafed14d42012-02-11 06:05:00 +010010105 se->depth = 0;
10106 } else {
Peter Zijlstra029632f2011-10-25 10:00:11 +020010107 se->cfs_rq = parent->my_q;
Peter Zijlstrafed14d42012-02-11 06:05:00 +010010108 se->depth = parent->depth + 1;
10109 }
Peter Zijlstra029632f2011-10-25 10:00:11 +020010110
10111 se->my_q = cfs_rq;
Paul Turner0ac9b1c2013-10-16 11:16:27 -070010112 /* guarantee group entities always have weight */
10113 update_load_set(&se->load, NICE_0_LOAD);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010114 se->parent = parent;
10115}
10116
10117static DEFINE_MUTEX(shares_mutex);
10118
10119int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10120{
10121 int i;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010122
10123 /*
10124 * We can't change the weight of the root cgroup.
10125 */
10126 if (!tg->se[0])
10127 return -EINVAL;
10128
10129 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10130
10131 mutex_lock(&shares_mutex);
10132 if (tg->shares == shares)
10133 goto done;
10134
10135 tg->shares = shares;
10136 for_each_possible_cpu(i) {
10137 struct rq *rq = cpu_rq(i);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010138 struct sched_entity *se = tg->se[i];
10139 struct rq_flags rf;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010140
Peter Zijlstra029632f2011-10-25 10:00:11 +020010141 /* Propagate contribution to hierarchy */
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010142 rq_lock_irqsave(rq, &rf);
Frederic Weisbecker71b1da42013-04-12 01:50:59 +020010143 update_rq_clock(rq);
Vincent Guittot89ee0482016-12-21 16:50:26 +010010144 for_each_sched_entity(se) {
Peter Zijlstra88c06162017-05-06 17:32:43 +020010145 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +020010146 update_cfs_group(se);
Vincent Guittot89ee0482016-12-21 16:50:26 +010010147 }
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010148 rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010149 }
10150
10151done:
10152 mutex_unlock(&shares_mutex);
10153 return 0;
10154}
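/*
 * Example (assuming a v1 cgroup hierarchy mounted at /sys/fs/cgroup/cpu,
 * which is outside this file): writes to cpu.shares end up here, and the
 * value is clamped to scale_load(MIN_SHARES)..scale_load(MAX_SHARES).
 * With the default of 1024 shares per group:
 *
 *	# mkdir /sys/fs/cgroup/cpu/batch
 *	# echo 512 > /sys/fs/cgroup/cpu/batch/cpu.shares
 *
 * gives the "batch" group roughly half the weight of a default sibling
 * group, and therefore about half the CPU time under contention.
 */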
10155#else /* CONFIG_FAIR_GROUP_SCHED */
10156
10157void free_fair_sched_group(struct task_group *tg) { }
10158
10159int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10160{
10161 return 1;
10162}
10163
Peter Zijlstra8663e242016-06-22 14:58:02 +020010164void online_fair_sched_group(struct task_group *tg) { }
10165
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010010166void unregister_fair_sched_group(struct task_group *tg) { }
Peter Zijlstra029632f2011-10-25 10:00:11 +020010167
10168#endif /* CONFIG_FAIR_GROUP_SCHED */
10169
Peter Zijlstra810b3812008-02-29 15:21:01 -050010170
H Hartley Sweeten6d686f42010-01-13 20:21:52 -070010171static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
Peter Williams0d721ce2009-09-21 01:31:53 +000010172{
10173 struct sched_entity *se = &task->se;
Peter Williams0d721ce2009-09-21 01:31:53 +000010174 unsigned int rr_interval = 0;
10175
10176 /*
10177	 * The time slice is 0 for SCHED_OTHER tasks that are on an otherwise
10178	 * idle runqueue.
10179 */
Peter Williams0d721ce2009-09-21 01:31:53 +000010180 if (rq->cfs.load.weight)
Zhu Yanhaia59f4e02013-01-08 12:56:52 +080010181 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams0d721ce2009-09-21 01:31:53 +000010182
10183 return rr_interval;
10184}
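/*
 * This backs sched_rr_get_interval(2) for fair tasks: user space gets the
 * expected slice (sched_slice()) rounded to jiffies, or 0 when the
 * runqueue is otherwise idle.  A rough usage sketch from user space (not
 * part of this file):
 *
 *	struct timespec ts;
 *	sched_rr_get_interval(0, &ts);	// 0 == the calling process
 *	// ts now holds the approximate CFS time slice
 */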
10185
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010186/*
10187 * All the scheduling class methods:
10188 */
Peter Zijlstra029632f2011-10-25 10:00:11 +020010189const struct sched_class fair_sched_class = {
Ingo Molnar5522d5d2007-10-15 17:00:12 +020010190 .next = &idle_sched_class,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010191 .enqueue_task = enqueue_task_fair,
10192 .dequeue_task = dequeue_task_fair,
10193 .yield_task = yield_task_fair,
Mike Galbraithd95f4122011-02-01 09:50:51 -050010194 .yield_to_task = yield_to_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010195
Ingo Molnar2e09bf52007-10-15 17:00:05 +020010196 .check_preempt_curr = check_preempt_wakeup,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010197
10198 .pick_next_task = pick_next_task_fair,
10199 .put_prev_task = put_prev_task_fair,
10200
Peter Williams681f3e62007-10-24 18:23:51 +020010201#ifdef CONFIG_SMP
Li Zefan4ce72a22008-10-22 15:25:26 +080010202 .select_task_rq = select_task_rq_fair,
Paul Turner0a74bef2012-10-04 13:18:30 +020010203 .migrate_task_rq = migrate_task_rq_fair,
Alex Shi141965c2013-06-26 13:05:39 +080010204
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010010205 .rq_online = rq_online_fair,
10206 .rq_offline = rq_offline_fair,
Peter Zijlstra88ec22d2009-12-16 18:04:41 +010010207
Yuyang Du12695572015-07-15 08:04:40 +080010208 .task_dead = task_dead_fair,
Peter Zijlstrac5b28032015-05-15 17:43:35 +020010209 .set_cpus_allowed = set_cpus_allowed_common,
Peter Williams681f3e62007-10-24 18:23:51 +020010210#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010211
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020010212 .set_curr_task = set_curr_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010213 .task_tick = task_tick_fair,
Peter Zijlstracd29fe62009-11-27 17:32:46 +010010214 .task_fork = task_fork_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +010010215
10216 .prio_changed = prio_changed_fair,
Peter Zijlstrada7a7352011-01-17 17:03:27 +010010217 .switched_from = switched_from_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +010010218 .switched_to = switched_to_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -050010219
Peter Williams0d721ce2009-09-21 01:31:53 +000010220 .get_rr_interval = get_rr_interval_fair,
10221
Stanislaw Gruszka6e998912014-11-12 16:58:44 +010010222 .update_curr = update_curr_fair,
10223
Peter Zijlstra810b3812008-02-29 15:21:01 -050010224#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittotea86cb42016-06-17 13:38:55 +020010225 .task_change_group = task_change_group_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -050010226#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010227};
10228
10229#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra029632f2011-10-25 10:00:11 +020010230void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010231{
Tejun Heoa9e7f652017-04-25 17:43:50 -070010232 struct cfs_rq *cfs_rq, *pos;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010233
Peter Zijlstra5973e5b2008-01-25 21:08:34 +010010234 rcu_read_lock();
Tejun Heoa9e7f652017-04-25 17:43:50 -070010235 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
Ingo Molnar5cef9ec2007-08-09 11:16:47 +020010236 print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra5973e5b2008-01-25 21:08:34 +010010237 rcu_read_unlock();
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020010238}
Srikar Dronamraju397f2372015-06-25 22:51:43 +053010239
10240#ifdef CONFIG_NUMA_BALANCING
10241void show_numa_stats(struct task_struct *p, struct seq_file *m)
10242{
10243 int node;
10244 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10245
10246 for_each_online_node(node) {
10247 if (p->numa_faults) {
10248 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10249 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10250 }
10251 if (p->numa_group) {
10252 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
10253 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
10254 }
10255 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10256 }
10257}
10258#endif /* CONFIG_NUMA_BALANCING */
10259#endif /* CONFIG_SCHED_DEBUG */
Peter Zijlstra029632f2011-10-25 10:00:11 +020010260
10261__init void init_sched_fair_class(void)
10262{
10263#ifdef CONFIG_SMP
10264 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10265
Frederic Weisbecker3451d022011-08-10 23:21:01 +020010266#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam554ceca2012-03-07 14:44:26 -080010267 nohz.next_balance = jiffies;
Vincent Guittotf643ea22018-02-13 11:31:17 +010010268 nohz.next_blocked = jiffies;
Peter Zijlstra029632f2011-10-25 10:00:11 +020010269 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Peter Zijlstra029632f2011-10-25 10:00:11 +020010270#endif
10271#endif /* SMP */
10272
10273}