blob: 5146163bfabb9ee3dd7dd7bd04c900257dea6220 [file] [log] [blame]
Greg Kroah-Hartmanb2441312017-11-01 15:07:57 +01001// SPDX-License-Identifier: GPL-2.0
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002/*
3 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 * Various enhancements by Dmitry Adamushko.
11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 * Group scheduling enhancements by Srivatsa Vaddagiri
14 * Copyright IBM Corporation, 2007
15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 * Scaled math optimizations by Thomas Gleixner
18 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra21805082007-08-25 18:41:53 +020019 *
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
Peter Zijlstra90eec102015-11-16 11:08:45 +010021 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020022 */
Ingo Molnar325ea102018-03-03 12:20:47 +010023#include "sched.h"
Peter Zijlstra029632f2011-10-25 10:00:11 +020024
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020025/*
Peter Zijlstra21805082007-08-25 18:41:53 +020026 * Targeted preemption latency for CPU-bound tasks:
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020027 *
Peter Zijlstra21805082007-08-25 18:41:53 +020028 * NOTE: this latency value is not the same as the concept of
Ingo Molnard274a4c2007-10-15 17:00:14 +020029 * 'timeslice length' - timeslices in CFS are of variable length
30 * and have no persistent notion like in traditional, time-slice
31 * based scheduling concepts.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020032 *
Ingo Molnard274a4c2007-10-15 17:00:14 +020033 * (to see the precise effective timeslice length of your workload,
34 * run vmstat and monitor the context-switches (cs) field)
Ingo Molnar2b4d5b22016-11-23 07:37:00 +010035 *
36 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020037 */
Ingo Molnar2b4d5b22016-11-23 07:37:00 +010038unsigned int sysctl_sched_latency = 6000000ULL;
Muchun Songed8885a2018-11-10 15:52:02 +080039static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar2bd8e6d2007-10-15 17:00:02 +020040
41/*
Christian Ehrhardt1983a922009-11-30 12:16:47 +010042 * The initial- and re-scaling of tunables is configurable
Christian Ehrhardt1983a922009-11-30 12:16:47 +010043 *
44 * Options are:
Ingo Molnar2b4d5b22016-11-23 07:37:00 +010045 *
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
48 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
49 *
50 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
Christian Ehrhardt1983a922009-11-30 12:16:47 +010051 */
Peter Zijlstra8a99b682021-03-24 11:43:21 +010052unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
Christian Ehrhardt1983a922009-11-30 12:16:47 +010053
54/*
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010055 * Minimal preemption granularity for CPU-bound tasks:
Ingo Molnar2b4d5b22016-11-23 07:37:00 +010056 *
Takuya Yoshikawa864616e2010-10-14 16:09:13 +090057 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010058 */
Muchun Songed8885a2018-11-10 15:52:02 +080059unsigned int sysctl_sched_min_granularity = 750000ULL;
60static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010061
62/*
Josh Don51ce83e2021-08-19 18:04:02 -070063 * Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
64 * Applies only when SCHED_IDLE tasks compete with normal tasks.
65 *
66 * (default: 0.75 msec)
67 */
68unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
69
70/*
Ingo Molnar2b4d5b22016-11-23 07:37:00 +010071 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010072 */
Ingo Molnar0bf377b2010-09-12 08:14:52 +020073static unsigned int sched_nr_latency = 8;
Peter Zijlstrab2be5e92007-11-09 22:39:37 +010074
75/*
Mike Galbraith2bba22c2009-09-09 15:41:37 +020076 * After fork, child runs first. If set to 0 (default) then
Ingo Molnar2bd8e6d2007-10-15 17:00:02 +020077 * parent will (try to) run first.
78 */
Mike Galbraith2bba22c2009-09-09 15:41:37 +020079unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra21805082007-08-25 18:41:53 +020080
81/*
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020082 * SCHED_OTHER wake-up granularity.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020083 *
84 * This option delays the preemption effects of decoupled workloads
85 * and reduces their over-scheduling. Synchronous workloads will still
86 * have immediate wakeup/sleep latencies.
Ingo Molnar2b4d5b22016-11-23 07:37:00 +010087 *
88 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020089 */
Muchun Songed8885a2018-11-10 15:52:02 +080090unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
91static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020092
Ingo Molnar2b4d5b22016-11-23 07:37:00 +010093const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
Ingo Molnarda84d962007-10-15 17:00:18 +020094
Thara Gopinath05289b92020-02-21 19:52:13 -050095int sched_thermal_decay_shift;
96static int __init setup_sched_thermal_decay_shift(char *str)
97{
98 int _shift = 0;
99
100 if (kstrtoint(str, 0, &_shift))
101 pr_warn("Unable to set scheduler thermal pressure decay shift parameter\n");
102
103 sched_thermal_decay_shift = clamp(_shift, 0, 10);
104 return 1;
105}
106__setup("sched_thermal_decay_shift=", setup_sched_thermal_decay_shift);
107
Tim Chenafe06ef2016-11-22 12:23:53 -0800108#ifdef CONFIG_SMP
109/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +0100110 * For asym packing, by default the lower numbered CPU has higher priority.
Tim Chenafe06ef2016-11-22 12:23:53 -0800111 */
112int __weak arch_asym_cpu_priority(int cpu)
113{
114 return -cpu;
115}
Olof Johansson6d101ba2018-11-25 14:41:05 -0800116
117/*
Viresh Kumar60e17f52019-06-04 12:31:52 +0530118 * The margin used when comparing utilization with CPU capacity.
Olof Johansson6d101ba2018-11-25 14:41:05 -0800119 *
120 * (default: ~20%)
121 */
Viresh Kumar60e17f52019-06-04 12:31:52 +0530122#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
123
Valentin Schneider4aed8aa2021-04-07 23:06:28 +0100124/*
125 * The margin used when comparing CPU capacities.
126 * is 'cap1' noticeably greater than 'cap2'
127 *
128 * (default: ~5%)
129 */
130#define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
Tim Chenafe06ef2016-11-22 12:23:53 -0800131#endif
132
Paul Turnerec12cb72011-07-21 09:43:30 -0700133#ifdef CONFIG_CFS_BANDWIDTH
134/*
135 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
136 * each time a cfs_rq requests quota.
137 *
138 * Note: in the case that the slice exceeds the runtime remaining (either due
139 * to consumption or the quota being specified to be smaller than the slice)
140 * we will always only issue the remaining available time.
141 *
Ingo Molnar2b4d5b22016-11-23 07:37:00 +0100142 * (default: 5 msec, units: microseconds)
143 */
144unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
Paul Turnerec12cb72011-07-21 09:43:30 -0700145#endif
146
Paul Gortmaker85276322013-04-19 15:10:50 -0400147static inline void update_load_add(struct load_weight *lw, unsigned long inc)
148{
149 lw->weight += inc;
150 lw->inv_weight = 0;
151}
152
153static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
154{
155 lw->weight -= dec;
156 lw->inv_weight = 0;
157}
158
159static inline void update_load_set(struct load_weight *lw, unsigned long w)
160{
161 lw->weight = w;
162 lw->inv_weight = 0;
163}
164
Peter Zijlstra029632f2011-10-25 10:00:11 +0200165/*
166 * Increase the granularity value when there are more CPUs,
167 * because with more CPUs the 'effective latency' as visible
168 * to users decreases. But the relationship is not linear,
169 * so pick a second-best guess by going with the log2 of the
170 * number of CPUs.
171 *
172 * This idea comes from the SD scheduler of Con Kolivas:
173 */
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200174static unsigned int get_update_sysctl_factor(void)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200175{
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200176 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
Peter Zijlstra029632f2011-10-25 10:00:11 +0200177 unsigned int factor;
178
179 switch (sysctl_sched_tunable_scaling) {
180 case SCHED_TUNABLESCALING_NONE:
181 factor = 1;
182 break;
183 case SCHED_TUNABLESCALING_LINEAR:
184 factor = cpus;
185 break;
186 case SCHED_TUNABLESCALING_LOG:
187 default:
188 factor = 1 + ilog2(cpus);
189 break;
190 }
191
192 return factor;
193}
194
195static void update_sysctl(void)
196{
197 unsigned int factor = get_update_sysctl_factor();
198
199#define SET_SYSCTL(name) \
200 (sysctl_##name = (factor) * normalized_sysctl_##name)
201 SET_SYSCTL(sched_min_granularity);
202 SET_SYSCTL(sched_latency);
203 SET_SYSCTL(sched_wakeup_granularity);
204#undef SET_SYSCTL
205}
206
Muchun Songf38f12d2020-04-06 15:47:50 +0800207void __init sched_init_granularity(void)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200208{
209 update_sysctl();
210}
211
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100212#define WMULT_CONST (~0U)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200213#define WMULT_SHIFT 32
214
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100215static void __update_inv_weight(struct load_weight *lw)
Peter Zijlstra029632f2011-10-25 10:00:11 +0200216{
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100217 unsigned long w;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200218
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100219 if (likely(lw->inv_weight))
220 return;
221
222 w = scale_load_down(lw->weight);
223
224 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
225 lw->inv_weight = 1;
226 else if (unlikely(!w))
227 lw->inv_weight = WMULT_CONST;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200228 else
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100229 lw->inv_weight = WMULT_CONST / w;
230}
Peter Zijlstra029632f2011-10-25 10:00:11 +0200231
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100232/*
233 * delta_exec * weight / lw.weight
234 * OR
235 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
236 *
Yuyang Du1c3de5e2016-03-30 07:07:51 +0800237 * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100238 * we're guaranteed shift stays positive because inv_weight is guaranteed to
239 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
240 *
241 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
242 * weight/lw.weight <= 1, and therefore our shift will also be positive.
243 */
244static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
245{
246 u64 fact = scale_load_down(weight);
Clement Courbet1e17fb82021-03-03 14:46:53 -0800247 u32 fact_hi = (u32)(fact >> 32);
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100248 int shift = WMULT_SHIFT;
Clement Courbet1e17fb82021-03-03 14:46:53 -0800249 int fs;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200250
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100251 __update_inv_weight(lw);
252
Clement Courbet1e17fb82021-03-03 14:46:53 -0800253 if (unlikely(fact_hi)) {
254 fs = fls(fact_hi);
255 shift -= fs;
256 fact >>= fs;
Peter Zijlstra029632f2011-10-25 10:00:11 +0200257 }
258
Peter Zijlstra2eeb01a2019-11-08 14:15:59 +0100259 fact = mul_u32_u32(fact, lw->inv_weight);
Peter Zijlstra029632f2011-10-25 10:00:11 +0200260
Clement Courbet1e17fb82021-03-03 14:46:53 -0800261 fact_hi = (u32)(fact >> 32);
262 if (fact_hi) {
263 fs = fls(fact_hi);
264 shift -= fs;
265 fact >>= fs;
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100266 }
267
268 return mul_u64_u32_shr(delta_exec, fact, shift);
Peter Zijlstra029632f2011-10-25 10:00:11 +0200269}
270
271
272const struct sched_class fair_sched_class;
Peter Zijlstraa4c2f002008-10-17 19:27:03 +0200273
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200274/**************************************************************
275 * CFS operations on generic schedulable entities:
276 */
277
278#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra8f488942009-07-24 12:25:30 +0200279
Peter Zijlstrab7581492008-04-19 19:45:00 +0200280/* Walk up scheduling entities hierarchy */
281#define for_each_sched_entity(se) \
282 for (; se; se = se->parent)
283
Qais Yousef3c93a0c2019-06-04 12:14:55 +0100284static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
285{
286 if (!path)
287 return;
288
289 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
290 autogroup_path(cfs_rq->tg, path, len);
291 else if (cfs_rq && cfs_rq->tg->css.cgroup)
292 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
293 else
294 strlcpy(path, "(null)", len);
295}
296
Vincent Guittotf6783312019-01-30 06:22:47 +0100297static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800298{
Peter Zijlstra5d299ea2019-01-30 14:41:04 +0100299 struct rq *rq = rq_of(cfs_rq);
300 int cpu = cpu_of(rq);
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800301
Peter Zijlstra5d299ea2019-01-30 14:41:04 +0100302 if (cfs_rq->on_list)
Vincent Guittotf6783312019-01-30 06:22:47 +0100303 return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list;
Peter Zijlstra5d299ea2019-01-30 14:41:04 +0100304
305 cfs_rq->on_list = 1;
306
307 /*
308 * Ensure we either appear before our parent (if already
309 * enqueued) or force our parent to appear after us when it is
310 * enqueued. The fact that we always enqueue bottom-up
311 * reduces this to two cases and a special case for the root
312 * cfs_rq. Furthermore, it also means that we will always reset
313 * tmp_alone_branch either when the branch is connected
314 * to a tree or when we reach the top of the tree
315 */
316 if (cfs_rq->tg->parent &&
317 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
318 /*
319 * If parent is already on the list, we add the child
320 * just before. Thanks to circular linked property of
321 * the list, this means to put the child at the tail
322 * of the list that starts by parent.
323 */
324 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
325 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
326 /*
327 * The branch is now connected to its tree so we can
328 * reset tmp_alone_branch to the beginning of the
329 * list.
330 */
331 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
Vincent Guittotf6783312019-01-30 06:22:47 +0100332 return true;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800333 }
Peter Zijlstra5d299ea2019-01-30 14:41:04 +0100334
335 if (!cfs_rq->tg->parent) {
336 /*
337 * cfs rq without parent should be put
338 * at the tail of the list.
339 */
340 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
341 &rq->leaf_cfs_rq_list);
342 /*
343 * We have reach the top of a tree so we can reset
344 * tmp_alone_branch to the beginning of the list.
345 */
346 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
Vincent Guittotf6783312019-01-30 06:22:47 +0100347 return true;
Peter Zijlstra5d299ea2019-01-30 14:41:04 +0100348 }
349
350 /*
351 * The parent has not already been added so we want to
352 * make sure that it will be put after us.
353 * tmp_alone_branch points to the begin of the branch
354 * where we will add parent.
355 */
356 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch);
357 /*
358 * update tmp_alone_branch to points to the new begin
359 * of the branch
360 */
361 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
Vincent Guittotf6783312019-01-30 06:22:47 +0100362 return false;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800363}
364
365static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
366{
367 if (cfs_rq->on_list) {
Vincent Guittot31bc6ae2019-02-06 17:14:21 +0100368 struct rq *rq = rq_of(cfs_rq);
369
370 /*
371 * With cfs_rq being unthrottled/throttled during an enqueue,
372 * it can happen the tmp_alone_branch points the a leaf that
373 * we finally want to del. In this case, tmp_alone_branch moves
374 * to the prev element but it will point to rq->leaf_cfs_rq_list
375 * at the end of the enqueue.
376 */
377 if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list)
378 rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev;
379
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800380 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
381 cfs_rq->on_list = 0;
382 }
383}
384
Peter Zijlstra5d299ea2019-01-30 14:41:04 +0100385static inline void assert_list_leaf_cfs_rq(struct rq *rq)
386{
387 SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
388}
389
Vincent Guittot039ae8b2019-02-06 17:14:22 +0100390/* Iterate thr' all leaf cfs_rq's on a runqueue */
391#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
392 list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
393 leaf_cfs_rq_list)
Peter Zijlstrab7581492008-04-19 19:45:00 +0200394
395/* Do the two (enqueued) entities belong to the same group ? */
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100396static inline struct cfs_rq *
Peter Zijlstrab7581492008-04-19 19:45:00 +0200397is_same_group(struct sched_entity *se, struct sched_entity *pse)
398{
399 if (se->cfs_rq == pse->cfs_rq)
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100400 return se->cfs_rq;
Peter Zijlstrab7581492008-04-19 19:45:00 +0200401
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100402 return NULL;
Peter Zijlstrab7581492008-04-19 19:45:00 +0200403}
404
405static inline struct sched_entity *parent_entity(struct sched_entity *se)
406{
407 return se->parent;
408}
409
Peter Zijlstra464b7522008-10-24 11:06:15 +0200410static void
411find_matching_se(struct sched_entity **se, struct sched_entity **pse)
412{
413 int se_depth, pse_depth;
414
415 /*
416 * preemption test can be made between sibling entities who are in the
417 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
418 * both tasks until we find their ancestors who are siblings of common
419 * parent.
420 */
421
422 /* First walk up until both entities are at same depth */
Peter Zijlstrafed14d42012-02-11 06:05:00 +0100423 se_depth = (*se)->depth;
424 pse_depth = (*pse)->depth;
Peter Zijlstra464b7522008-10-24 11:06:15 +0200425
426 while (se_depth > pse_depth) {
427 se_depth--;
428 *se = parent_entity(*se);
429 }
430
431 while (pse_depth > se_depth) {
432 pse_depth--;
433 *pse = parent_entity(*pse);
434 }
435
436 while (!is_same_group(*se, *pse)) {
437 *se = parent_entity(*se);
438 *pse = parent_entity(*pse);
439 }
440}
441
Josh Don30400032021-07-29 19:00:18 -0700442static int tg_is_idle(struct task_group *tg)
443{
444 return tg->idle > 0;
445}
446
447static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
448{
449 return cfs_rq->idle > 0;
450}
451
452static int se_is_idle(struct sched_entity *se)
453{
454 if (entity_is_task(se))
455 return task_has_idle_policy(task_of(se));
456 return cfs_rq_is_idle(group_cfs_rq(se));
457}
458
Peter Zijlstra8f488942009-07-24 12:25:30 +0200459#else /* !CONFIG_FAIR_GROUP_SCHED */
460
Peter Zijlstrab7581492008-04-19 19:45:00 +0200461#define for_each_sched_entity(se) \
462 for (; se; se = NULL)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200463
Qais Yousef3c93a0c2019-06-04 12:14:55 +0100464static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
465{
466 if (path)
467 strlcpy(path, "(null)", len);
468}
469
Vincent Guittotf6783312019-01-30 06:22:47 +0100470static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800471{
Vincent Guittotf6783312019-01-30 06:22:47 +0100472 return true;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800473}
474
475static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
476{
477}
478
Peter Zijlstra5d299ea2019-01-30 14:41:04 +0100479static inline void assert_list_leaf_cfs_rq(struct rq *rq)
480{
481}
482
Vincent Guittot039ae8b2019-02-06 17:14:22 +0100483#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
484 for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
Peter Zijlstrab7581492008-04-19 19:45:00 +0200485
Peter Zijlstrab7581492008-04-19 19:45:00 +0200486static inline struct sched_entity *parent_entity(struct sched_entity *se)
487{
488 return NULL;
489}
490
Peter Zijlstra464b7522008-10-24 11:06:15 +0200491static inline void
492find_matching_se(struct sched_entity **se, struct sched_entity **pse)
493{
494}
495
Ingo Molnar366e7ad62021-08-26 10:47:09 +0200496static inline int tg_is_idle(struct task_group *tg)
Josh Don30400032021-07-29 19:00:18 -0700497{
498 return 0;
499}
500
501static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
502{
503 return 0;
504}
505
506static int se_is_idle(struct sched_entity *se)
507{
508 return 0;
509}
510
Peter Zijlstrab7581492008-04-19 19:45:00 +0200511#endif /* CONFIG_FAIR_GROUP_SCHED */
512
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -0700513static __always_inline
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100514void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200515
516/**************************************************************
517 * Scheduling class tree data structure manipulation methods:
518 */
519
Andrei Epure1bf08232013-03-12 21:12:24 +0200520static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra02e04312007-10-15 17:00:07 +0200521{
Andrei Epure1bf08232013-03-12 21:12:24 +0200522 s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra368059a2007-10-15 17:00:11 +0200523 if (delta > 0)
Andrei Epure1bf08232013-03-12 21:12:24 +0200524 max_vruntime = vruntime;
Peter Zijlstra02e04312007-10-15 17:00:07 +0200525
Andrei Epure1bf08232013-03-12 21:12:24 +0200526 return max_vruntime;
Peter Zijlstra02e04312007-10-15 17:00:07 +0200527}
528
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200529static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstrab0ffd242007-10-15 17:00:12 +0200530{
531 s64 delta = (s64)(vruntime - min_vruntime);
532 if (delta < 0)
533 min_vruntime = vruntime;
534
535 return min_vruntime;
536}
537
Peter Zijlstrabf9be9a2020-04-29 17:04:12 +0200538static inline bool entity_before(struct sched_entity *a,
Fabio Checconi54fdc582009-07-16 12:32:27 +0200539 struct sched_entity *b)
540{
541 return (s64)(a->vruntime - b->vruntime) < 0;
542}
543
Peter Zijlstrabf9be9a2020-04-29 17:04:12 +0200544#define __node_2_se(node) \
545 rb_entry((node), struct sched_entity, run_node)
546
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200547static void update_min_vruntime(struct cfs_rq *cfs_rq)
548{
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200549 struct sched_entity *curr = cfs_rq->curr;
Davidlohr Buesobfb06882017-09-08 16:14:55 -0700550 struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline);
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200551
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200552 u64 vruntime = cfs_rq->min_vruntime;
553
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200554 if (curr) {
555 if (curr->on_rq)
556 vruntime = curr->vruntime;
557 else
558 curr = NULL;
559 }
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200560
Davidlohr Buesobfb06882017-09-08 16:14:55 -0700561 if (leftmost) { /* non-empty tree */
Peter Zijlstrabf9be9a2020-04-29 17:04:12 +0200562 struct sched_entity *se = __node_2_se(leftmost);
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200563
Peter Zijlstrab60205c2016-09-20 21:58:12 +0200564 if (!curr)
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200565 vruntime = se->vruntime;
566 else
567 vruntime = min_vruntime(vruntime, se->vruntime);
568 }
569
Andrei Epure1bf08232013-03-12 21:12:24 +0200570 /* ensure we never gain time by being placed backwards. */
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200571 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra3fe16982011-04-05 17:23:48 +0200572#ifndef CONFIG_64BIT
573 smp_wmb();
574 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
575#endif
Peter Zijlstra1af5f732008-10-24 11:06:13 +0200576}
577
Peter Zijlstrabf9be9a2020-04-29 17:04:12 +0200578static inline bool __entity_less(struct rb_node *a, const struct rb_node *b)
579{
580 return entity_before(__node_2_se(a), __node_2_se(b));
581}
582
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200583/*
584 * Enqueue an entity into the rb-tree:
585 */
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200586static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200587{
Peter Zijlstrabf9be9a2020-04-29 17:04:12 +0200588 rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200589}
590
Ingo Molnar0702e3e2007-10-15 17:00:14 +0200591static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200592{
Davidlohr Buesobfb06882017-09-08 16:14:55 -0700593 rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200594}
595
Peter Zijlstra029632f2011-10-25 10:00:11 +0200596struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200597{
Davidlohr Buesobfb06882017-09-08 16:14:55 -0700598 struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline);
Peter Zijlstraf4b67552008-11-04 21:25:07 +0100599
600 if (!left)
601 return NULL;
602
Peter Zijlstrabf9be9a2020-04-29 17:04:12 +0200603 return __node_2_se(left);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200604}
605
Rik van Rielac53db52011-02-01 09:51:03 -0500606static struct sched_entity *__pick_next_entity(struct sched_entity *se)
607{
608 struct rb_node *next = rb_next(&se->run_node);
609
610 if (!next)
611 return NULL;
612
Peter Zijlstrabf9be9a2020-04-29 17:04:12 +0200613 return __node_2_se(next);
Rik van Rielac53db52011-02-01 09:51:03 -0500614}
615
616#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra029632f2011-10-25 10:00:11 +0200617struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200618{
Davidlohr Buesobfb06882017-09-08 16:14:55 -0700619 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200620
Balbir Singh70eee742008-02-22 13:25:53 +0530621 if (!last)
622 return NULL;
Ingo Molnar7eee3e62008-02-22 10:32:21 +0100623
Peter Zijlstrabf9be9a2020-04-29 17:04:12 +0200624 return __node_2_se(last);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +0200625}
626
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200627/**************************************************************
628 * Scheduling class statistics methods:
629 */
630
Peter Zijlstra8a99b682021-03-24 11:43:21 +0100631int sched_update_scaling(void)
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100632{
Nicholas Mc Guire58ac93e2015-05-15 21:05:42 +0200633 unsigned int factor = get_update_sysctl_factor();
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100634
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100635 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
636 sysctl_sched_min_granularity);
637
Christian Ehrhardtacb4a842009-11-30 12:16:48 +0100638#define WRT_SYSCTL(name) \
639 (normalized_sysctl_##name = sysctl_##name / (factor))
640 WRT_SYSCTL(sched_min_granularity);
641 WRT_SYSCTL(sched_latency);
642 WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardtacb4a842009-11-30 12:16:48 +0100643#undef WRT_SYSCTL
644
Peter Zijlstrab2be5e92007-11-09 22:39:37 +0100645 return 0;
646}
647#endif
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200648
649/*
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200650 * delta /= w
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200651 */
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100652static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200653{
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200654 if (unlikely(se->load.weight != NICE_0_LOAD))
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100655 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200656
657 return delta;
658}
659
660/*
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200661 * The idea is to set a period in which each task runs once.
662 *
Borislav Petkov532b1852012-08-08 16:16:04 +0200663 * When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200664 * this period because otherwise the slices get too small.
665 *
666 * p = (nr <= nl) ? l : l*nr/nl
667 */
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +0200668static u64 __sched_period(unsigned long nr_running)
669{
Boqun Feng8e2b0bf2015-07-02 22:25:52 +0800670 if (unlikely(nr_running > sched_nr_latency))
671 return nr_running * sysctl_sched_min_granularity;
672 else
673 return sysctl_sched_latency;
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +0200674}
675
Josh Don51ce83e2021-08-19 18:04:02 -0700676static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq);
677
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200678/*
679 * We calculate the wall-time slice from the period by taking a part
680 * proportional to the weight.
681 *
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200682 * s = p*P[w/rw]
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200683 */
Peter Zijlstra6d0f0eb2007-10-15 17:00:05 +0200684static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
Peter Zijlstra21805082007-08-25 18:41:53 +0200685{
Peter Zijlstra0c2de3f2021-03-25 13:44:46 +0100686 unsigned int nr_running = cfs_rq->nr_running;
Josh Don51ce83e2021-08-19 18:04:02 -0700687 struct sched_entity *init_se = se;
688 unsigned int min_gran;
Peter Zijlstra0c2de3f2021-03-25 13:44:46 +0100689 u64 slice;
690
691 if (sched_feat(ALT_PERIOD))
692 nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
693
694 slice = __sched_period(nr_running + !se->on_rq);
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200695
Mike Galbraith0a582442009-01-02 12:16:42 +0100696 for_each_sched_entity(se) {
Lin Ming6272d682009-01-15 17:17:15 +0100697 struct load_weight *load;
Christian Engelmayer3104bf02009-06-16 10:35:12 +0200698 struct load_weight lw;
Josh Don51ce83e2021-08-19 18:04:02 -0700699 struct cfs_rq *qcfs_rq;
Lin Ming6272d682009-01-15 17:17:15 +0100700
Josh Don51ce83e2021-08-19 18:04:02 -0700701 qcfs_rq = cfs_rq_of(se);
702 load = &qcfs_rq->load;
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200703
Mike Galbraith0a582442009-01-02 12:16:42 +0100704 if (unlikely(!se->on_rq)) {
Josh Don51ce83e2021-08-19 18:04:02 -0700705 lw = qcfs_rq->load;
Mike Galbraith0a582442009-01-02 12:16:42 +0100706
707 update_load_add(&lw, se->load.weight);
708 load = &lw;
709 }
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100710 slice = __calc_delta(slice, se->load.weight, load);
Mike Galbraith0a582442009-01-02 12:16:42 +0100711 }
Peter Zijlstra0c2de3f2021-03-25 13:44:46 +0100712
Josh Don51ce83e2021-08-19 18:04:02 -0700713 if (sched_feat(BASE_SLICE)) {
714 if (se_is_idle(init_se) && !sched_idle_cfs_rq(cfs_rq))
715 min_gran = sysctl_sched_idle_min_granularity;
716 else
717 min_gran = sysctl_sched_min_granularity;
718
719 slice = max_t(u64, slice, min_gran);
720 }
Peter Zijlstra0c2de3f2021-03-25 13:44:46 +0100721
Mike Galbraith0a582442009-01-02 12:16:42 +0100722 return slice;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200723}
724
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200725/*
Andrei Epure660cc002013-03-11 12:03:20 +0200726 * We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200727 *
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200728 * vs = s/w
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200729 */
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200730static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar647e7ca2007-10-15 17:00:13 +0200731{
Peter Zijlstraf9c0b092008-10-17 19:27:04 +0200732 return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200733}
734
Vincent Guittotc0796292018-06-28 17:45:04 +0200735#include "pelt.h"
Vincent Guittot23127292019-01-23 16:26:53 +0100736#ifdef CONFIG_SMP
Peter Zijlstra283e2ed2017-04-11 11:08:42 +0200737
Morten Rasmussen772bd008c2016-06-22 18:03:13 +0100738static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +0100739static unsigned long task_h_load(struct task_struct *p);
Morten Rasmussen3b1baa62018-07-04 11:17:40 +0100740static unsigned long capacity_of(int cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +0100741
Yuyang Du540247f2015-07-15 08:04:39 +0800742/* Give new sched_entity start runnable values to heavy its load in infant time */
743void init_entity_runnable_average(struct sched_entity *se)
Alex Shia75cdaa2013-06-20 10:18:47 +0800744{
Yuyang Du540247f2015-07-15 08:04:39 +0800745 struct sched_avg *sa = &se->avg;
Alex Shia75cdaa2013-06-20 10:18:47 +0800746
Peter Zijlstraf2079342017-05-12 14:16:30 +0200747 memset(sa, 0, sizeof(*sa));
748
Vincent Guittotb5a9b342016-10-19 14:45:23 +0200749 /*
Ingo Molnardfcb2452018-12-03 10:05:56 +0100750 * Tasks are initialized with full load to be seen as heavy tasks until
Vincent Guittotb5a9b342016-10-19 14:45:23 +0200751 * they get a chance to stabilize to their real load level.
Ingo Molnardfcb2452018-12-03 10:05:56 +0100752 * Group entities are initialized with zero load to reflect the fact that
Vincent Guittotb5a9b342016-10-19 14:45:23 +0200753 * nothing has been attached to the task group yet.
754 */
755 if (entity_is_task(se))
Vincent Guittot0dacee12020-02-24 09:52:17 +0000756 sa->load_avg = scale_load_down(se->load.weight);
Peter Zijlstraf2079342017-05-12 14:16:30 +0200757
Yuyang Du9d89c252015-07-15 08:04:37 +0800758 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
Alex Shia75cdaa2013-06-20 10:18:47 +0800759}
Yuyang Du7ea241a2015-07-15 08:04:42 +0800760
Vincent Guittotdf217912016-11-08 10:53:42 +0100761static void attach_entity_cfs_rq(struct sched_entity *se);
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200762
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800763/*
764 * With new tasks being created, their initial util_avgs are extrapolated
765 * based on the cfs_rq's current util_avg:
766 *
767 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
768 *
769 * However, in many cases, the above util_avg does not give a desired
770 * value. Moreover, the sum of the util_avgs may be divergent, such
771 * as when the series is a harmonic series.
772 *
773 * To solve this problem, we also cap the util_avg of successive tasks to
774 * only 1/2 of the left utilization budget:
775 *
Quentin Perret8fe5c5a2018-06-12 12:22:15 +0100776 * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800777 *
Quentin Perret8fe5c5a2018-06-12 12:22:15 +0100778 * where n denotes the nth task and cpu_scale the CPU capacity.
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800779 *
Quentin Perret8fe5c5a2018-06-12 12:22:15 +0100780 * For example, for a CPU with 1024 of capacity, a simplest series from
781 * the beginning would be like:
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800782 *
783 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
784 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
785 *
786 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
787 * if util_avg > util_avg_cap.
788 */
Dietmar Eggemannd0fe0b92019-01-22 16:25:01 +0000789void post_init_entity_util_avg(struct task_struct *p)
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800790{
Dietmar Eggemannd0fe0b92019-01-22 16:25:01 +0000791 struct sched_entity *se = &p->se;
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800792 struct cfs_rq *cfs_rq = cfs_rq_of(se);
793 struct sched_avg *sa = &se->avg;
Vincent Guittot8ec59c02019-06-17 17:00:17 +0200794 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
Quentin Perret8fe5c5a2018-06-12 12:22:15 +0100795 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800796
797 if (cap > 0) {
798 if (cfs_rq->avg.util_avg != 0) {
799 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
800 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
801
802 if (sa->util_avg > cap)
803 sa->util_avg = cap;
804 } else {
805 sa->util_avg = cap;
806 }
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800807 }
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200808
Vincent Guittote21cf432020-06-24 17:44:22 +0200809 sa->runnable_avg = sa->util_avg;
Vincent Guittot9f683952020-02-24 09:52:18 +0000810
Dietmar Eggemannd0fe0b92019-01-22 16:25:01 +0000811 if (p->sched_class != &fair_sched_class) {
812 /*
813 * For !fair tasks do:
814 *
815 update_cfs_rq_load_avg(now, cfs_rq);
Vincent Guittota4f9a0e2020-01-15 11:20:20 +0100816 attach_entity_load_avg(cfs_rq, se);
Dietmar Eggemannd0fe0b92019-01-22 16:25:01 +0000817 switched_from_fair(rq, p);
818 *
819 * such that the next switched_to_fair() has the
820 * expected state.
821 */
822 se->avg.last_update_time = cfs_rq_clock_pelt(cfs_rq);
823 return;
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200824 }
825
Vincent Guittotdf217912016-11-08 10:53:42 +0100826 attach_entity_cfs_rq(se);
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800827}
828
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200829#else /* !CONFIG_SMP */
Yuyang Du540247f2015-07-15 08:04:39 +0800830void init_entity_runnable_average(struct sched_entity *se)
Alex Shia75cdaa2013-06-20 10:18:47 +0800831{
832}
Dietmar Eggemannd0fe0b92019-01-22 16:25:01 +0000833void post_init_entity_util_avg(struct task_struct *p)
Yuyang Du2b8c41d2016-03-30 04:30:56 +0800834{
835}
Xianting Tianfe749152020-09-24 09:47:55 +0800836static void update_tg_load_avg(struct cfs_rq *cfs_rq)
Peter Zijlstra3d30544f2016-06-21 14:27:50 +0200837{
838}
Peter Zijlstra7dc603c2016-06-16 13:29:28 +0200839#endif /* CONFIG_SMP */
Alex Shia75cdaa2013-06-20 10:18:47 +0800840
Peter Zijlstraa7be37a2008-06-27 13:41:11 +0200841/*
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100842 * Update the current task's runtime statistics.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200843 */
Ingo Molnarb7cc0892007-08-09 11:16:47 +0200844static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200845{
Ingo Molnar429d43b2007-10-15 17:00:03 +0200846 struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker78becc22013-04-12 01:51:02 +0200847 u64 now = rq_clock_task(rq_of(cfs_rq));
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100848 u64 delta_exec;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200849
850 if (unlikely(!curr))
851 return;
852
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100853 delta_exec = now - curr->exec_start;
854 if (unlikely((s64)delta_exec <= 0))
Peter Zijlstra34f28ec2008-12-16 08:45:31 +0100855 return;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200856
Ingo Molnar8ebc91d2007-10-15 17:00:03 +0200857 curr->exec_start = now;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100858
Yafang Shaoceeadb82021-09-05 14:35:41 +0000859 if (schedstat_enabled()) {
860 struct sched_statistics *stats;
861
862 stats = __schedstats_from_se(curr);
863 __schedstat_set(stats->exec_max,
864 max(delta_exec, stats->exec_max));
865 }
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100866
867 curr->sum_exec_runtime += delta_exec;
Josh Poimboeufae928822016-06-17 12:43:24 -0500868 schedstat_add(cfs_rq->exec_clock, delta_exec);
Peter Zijlstra9dbdb152013-11-18 18:27:06 +0100869
870 curr->vruntime += calc_delta_fair(delta_exec, curr);
871 update_min_vruntime(cfs_rq);
872
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100873 if (entity_is_task(curr)) {
874 struct task_struct *curtask = task_of(curr);
875
Ingo Molnarf977bb42009-09-13 18:15:54 +0200876 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Tejun Heod2cc5ed2017-09-25 08:12:04 -0700877 cgroup_account_cputime(curtask, delta_exec);
Frank Mayharf06febc2008-09-12 09:54:39 -0700878 account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +0100879 }
Paul Turnerec12cb72011-07-21 09:43:30 -0700880
881 account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200882}
883
Stanislaw Gruszka6e998912014-11-12 16:58:44 +0100884static void update_curr_fair(struct rq *rq)
885{
886 update_curr(cfs_rq_of(&rq->curr->se));
887}
888
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200889static inline void
Yafang Shao60f24152021-09-05 14:35:42 +0000890update_stats_wait_start_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200891{
Yafang Shaoceeadb82021-09-05 14:35:41 +0000892 struct sched_statistics *stats;
Yafang Shao60f24152021-09-05 14:35:42 +0000893 struct task_struct *p = NULL;
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500894
895 if (!schedstat_enabled())
896 return;
897
Yafang Shaoceeadb82021-09-05 14:35:41 +0000898 stats = __schedstats_from_se(se);
899
Yafang Shao60f24152021-09-05 14:35:42 +0000900 if (entity_is_task(se))
901 p = task_of(se);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800902
Yafang Shao60f24152021-09-05 14:35:42 +0000903 __update_stats_wait_start(rq_of(cfs_rq), p, stats);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200904}
905
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500906static inline void
Yafang Shao60f24152021-09-05 14:35:42 +0000907update_stats_wait_end_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800908{
Yafang Shaoceeadb82021-09-05 14:35:41 +0000909 struct sched_statistics *stats;
910 struct task_struct *p = NULL;
Mel Gormancb251762016-02-05 09:08:36 +0000911
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500912 if (!schedstat_enabled())
913 return;
914
Yafang Shaoceeadb82021-09-05 14:35:41 +0000915 stats = __schedstats_from_se(se);
916
jun qianb9c88f72020-10-15 14:48:46 +0800917 /*
918 * When the sched_schedstat changes from 0 to 1, some sched se
919 * maybe already in the runqueue, the se->statistics.wait_start
920 * will be 0.So it will let the delta wrong. We need to avoid this
921 * scenario.
922 */
Yafang Shaoceeadb82021-09-05 14:35:41 +0000923 if (unlikely(!schedstat_val(stats->wait_start)))
jun qianb9c88f72020-10-15 14:48:46 +0800924 return;
925
Yafang Shao60f24152021-09-05 14:35:42 +0000926 if (entity_is_task(se))
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800927 p = task_of(se);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800928
Yafang Shao60f24152021-09-05 14:35:42 +0000929 __update_stats_wait_end(rq_of(cfs_rq), p, stats);
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800930}
Joonwoo Park3ea94de2015-11-12 19:38:54 -0800931
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500932static inline void
Yafang Shao60f24152021-09-05 14:35:42 +0000933update_stats_enqueue_sleeper_fair(struct cfs_rq *cfs_rq, struct sched_entity *se)
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500934{
Yafang Shaoceeadb82021-09-05 14:35:41 +0000935 struct sched_statistics *stats;
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500936 struct task_struct *tsk = NULL;
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500937
938 if (!schedstat_enabled())
939 return;
940
Yafang Shaoceeadb82021-09-05 14:35:41 +0000941 stats = __schedstats_from_se(se);
942
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500943 if (entity_is_task(se))
944 tsk = task_of(se);
945
Yafang Shao60f24152021-09-05 14:35:42 +0000946 __update_stats_enqueue_sleeper(rq_of(cfs_rq), tsk, stats);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200947}
948
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200949/*
950 * Task is being enqueued - update stats:
951 */
Mel Gormancb251762016-02-05 09:08:36 +0000952static inline void
Yafang Shao60f24152021-09-05 14:35:42 +0000953update_stats_enqueue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200954{
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500955 if (!schedstat_enabled())
956 return;
957
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200958 /*
959 * Are we enqueueing a waiting task? (for current tasks
960 * a dequeue/enqueue event is a NOP)
961 */
Ingo Molnar429d43b2007-10-15 17:00:03 +0200962 if (se != cfs_rq->curr)
Yafang Shao60f24152021-09-05 14:35:42 +0000963 update_stats_wait_start_fair(cfs_rq, se);
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -0500964
965 if (flags & ENQUEUE_WAKEUP)
Yafang Shao60f24152021-09-05 14:35:42 +0000966 update_stats_enqueue_sleeper_fair(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200967}
968
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200969static inline void
Yafang Shao60f24152021-09-05 14:35:42 +0000970update_stats_dequeue_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200971{
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500972
973 if (!schedstat_enabled())
974 return;
975
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200976 /*
977 * Mark the end of the wait period if dequeueing a
978 * waiting task:
979 */
Ingo Molnar429d43b2007-10-15 17:00:03 +0200980 if (se != cfs_rq->curr)
Yafang Shao60f24152021-09-05 14:35:42 +0000981 update_stats_wait_end_fair(cfs_rq, se);
Mel Gormancb251762016-02-05 09:08:36 +0000982
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500983 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
984 struct task_struct *tsk = task_of(se);
Peter Zijlstra2f064a52021-06-11 10:28:17 +0200985 unsigned int state;
Mel Gormancb251762016-02-05 09:08:36 +0000986
Peter Zijlstra2f064a52021-06-11 10:28:17 +0200987 /* XXX racy against TTWU */
988 state = READ_ONCE(tsk->__state);
989 if (state & TASK_INTERRUPTIBLE)
Yafang Shaoceeadb82021-09-05 14:35:41 +0000990 __schedstat_set(tsk->stats.sleep_start,
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500991 rq_clock(rq_of(cfs_rq)));
Peter Zijlstra2f064a52021-06-11 10:28:17 +0200992 if (state & TASK_UNINTERRUPTIBLE)
Yafang Shaoceeadb82021-09-05 14:35:41 +0000993 __schedstat_set(tsk->stats.block_start,
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -0500994 rq_clock(rq_of(cfs_rq)));
Mel Gormancb251762016-02-05 09:08:36 +0000995 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +0200996}
997
998/*
999 * We are picking a new current task - update its stats:
1000 */
1001static inline void
Ingo Molnar79303e92007-08-09 11:16:47 +02001002update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001003{
1004 /*
1005 * We are starting a new run period:
1006 */
Frederic Weisbecker78becc22013-04-12 01:51:02 +02001007 se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001008}
1009
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001010/**************************************************
1011 * Scheduling class queueing methods:
1012 */
1013
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001014#ifdef CONFIG_NUMA_BALANCING
1015/*
Mel Gorman598f0ec2013-10-07 11:28:55 +01001016 * Approximate time to scan a full NUMA task in ms. The task scan period is
1017 * calculated based on the tasks virtual memory size and
1018 * numa_balancing_scan_size.
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001019 */
Mel Gorman598f0ec2013-10-07 11:28:55 +01001020unsigned int sysctl_numa_balancing_scan_period_min = 1000;
1021unsigned int sysctl_numa_balancing_scan_period_max = 60000;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02001022
1023/* Portion of address space to scan in MB */
1024unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02001025
Peter Zijlstra4b96a292012-10-25 14:16:47 +02001026/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1027unsigned int sysctl_numa_balancing_scan_delay = 1000;
1028
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001029struct numa_group {
Elena Reshetovac45a7792019-01-18 14:27:28 +02001030 refcount_t refcount;
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001031
1032 spinlock_t lock; /* nr_tasks, tasks */
1033 int nr_tasks;
1034 pid_t gid;
1035 int active_nodes;
1036
1037 struct rcu_head rcu;
1038 unsigned long total_faults;
1039 unsigned long max_faults_cpu;
1040 /*
Bharata B Rao5b763a12021-10-04 16:27:04 +05301041 * faults[] array is split into two regions: faults_mem and faults_cpu.
1042 *
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001043 * Faults_cpu is used to decide whether memory should move
1044 * towards the CPU. As a consequence, these stats are weighted
1045 * more by CPU use than by memory faults.
1046 */
Gustavo A. R. Silva04f5c362020-05-07 14:21:41 -05001047 unsigned long faults[];
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001048};
1049
Jann Horncb361d82019-07-16 17:20:47 +02001050/*
1051 * For functions that can be called in multiple contexts that permit reading
1052 * ->numa_group (see struct task_struct for locking rules).
1053 */
1054static struct numa_group *deref_task_numa_group(struct task_struct *p)
1055{
1056 return rcu_dereference_check(p->numa_group, p == current ||
Peter Zijlstra9ef7e7e2021-03-03 16:45:41 +01001057 (lockdep_is_held(__rq_lockp(task_rq(p))) && !READ_ONCE(p->on_cpu)));
Jann Horncb361d82019-07-16 17:20:47 +02001058}
1059
1060static struct numa_group *deref_curr_numa_group(struct task_struct *p)
1061{
1062 return rcu_dereference_protected(p->numa_group, p == current);
1063}
1064
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001065static inline unsigned long group_faults_priv(struct numa_group *ng);
1066static inline unsigned long group_faults_shared(struct numa_group *ng);
1067
Mel Gorman598f0ec2013-10-07 11:28:55 +01001068static unsigned int task_nr_scan_windows(struct task_struct *p)
1069{
1070 unsigned long rss = 0;
1071 unsigned long nr_scan_pages;
1072
1073 /*
1074 * Calculations based on RSS as non-present and empty pages are skipped
1075 * by the PTE scanner and NUMA hinting faults should be trapped based
1076 * on resident pages
1077 */
1078 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
1079 rss = get_mm_rss(p->mm);
1080 if (!rss)
1081 rss = nr_scan_pages;
1082
1083 rss = round_up(rss, nr_scan_pages);
1084 return rss / nr_scan_pages;
1085}
1086
Ingo Molnar3b037062021-03-18 13:38:50 +01001087/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
Mel Gorman598f0ec2013-10-07 11:28:55 +01001088#define MAX_SCAN_WINDOW 2560
1089
1090static unsigned int task_scan_min(struct task_struct *p)
1091{
Jason Low316c1608d2015-04-28 13:00:20 -07001092 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
Mel Gorman598f0ec2013-10-07 11:28:55 +01001093 unsigned int scan, floor;
1094 unsigned int windows = 1;
1095
Kirill Tkhai64192652014-10-16 14:39:37 +04001096 if (scan_size < MAX_SCAN_WINDOW)
1097 windows = MAX_SCAN_WINDOW / scan_size;
Mel Gorman598f0ec2013-10-07 11:28:55 +01001098 floor = 1000 / windows;
1099
1100 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1101 return max_t(unsigned int, floor, scan);
1102}
1103
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001104static unsigned int task_scan_start(struct task_struct *p)
1105{
1106 unsigned long smin = task_scan_min(p);
1107 unsigned long period = smin;
Jann Horncb361d82019-07-16 17:20:47 +02001108 struct numa_group *ng;
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001109
1110 /* Scale the maximum scan period with the amount of shared memory. */
Jann Horncb361d82019-07-16 17:20:47 +02001111 rcu_read_lock();
1112 ng = rcu_dereference(p->numa_group);
1113 if (ng) {
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001114 unsigned long shared = group_faults_shared(ng);
1115 unsigned long private = group_faults_priv(ng);
1116
Elena Reshetovac45a7792019-01-18 14:27:28 +02001117 period *= refcount_read(&ng->refcount);
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001118 period *= shared + 1;
1119 period /= private + shared + 1;
1120 }
Jann Horncb361d82019-07-16 17:20:47 +02001121 rcu_read_unlock();
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001122
1123 return max(smin, period);
1124}
1125
Mel Gorman598f0ec2013-10-07 11:28:55 +01001126static unsigned int task_scan_max(struct task_struct *p)
1127{
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001128 unsigned long smin = task_scan_min(p);
1129 unsigned long smax;
Jann Horncb361d82019-07-16 17:20:47 +02001130 struct numa_group *ng;
Mel Gorman598f0ec2013-10-07 11:28:55 +01001131
1132 /* Watch for min being lower than max due to floor calculations */
1133 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001134
1135 /* Scale the maximum scan period with the amount of shared memory. */
Jann Horncb361d82019-07-16 17:20:47 +02001136 ng = deref_curr_numa_group(p);
1137 if (ng) {
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001138 unsigned long shared = group_faults_shared(ng);
1139 unsigned long private = group_faults_priv(ng);
1140 unsigned long period = smax;
1141
Elena Reshetovac45a7792019-01-18 14:27:28 +02001142 period *= refcount_read(&ng->refcount);
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001143 period *= shared + 1;
1144 period /= private + shared + 1;
1145
1146 smax = max(smax, period);
1147 }
1148
Mel Gorman598f0ec2013-10-07 11:28:55 +01001149 return max(smin, smax);
1150}
1151
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01001152static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1153{
Anshuman Khandual98fa15f2019-03-05 15:42:58 -08001154 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01001155 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1156}
1157
1158static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1159{
Anshuman Khandual98fa15f2019-03-05 15:42:58 -08001160 rq->nr_numa_running -= (p->numa_preferred_nid != NUMA_NO_NODE);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01001161 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1162}
1163
Rik van Rielbe1e4e72014-01-27 17:03:48 -05001164/* Shared or private faults. */
1165#define NR_NUMA_HINT_FAULT_TYPES 2
1166
1167/* Memory and CPU locality */
1168#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1169
1170/* Averaged statistics, and temporary buffers. */
1171#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1172
Mel Gormane29cf082013-10-07 11:29:22 +01001173pid_t task_numa_group_id(struct task_struct *p)
1174{
Jann Horncb361d82019-07-16 17:20:47 +02001175 struct numa_group *ng;
1176 pid_t gid = 0;
1177
1178 rcu_read_lock();
1179 ng = rcu_dereference(p->numa_group);
1180 if (ng)
1181 gid = ng->gid;
1182 rcu_read_unlock();
1183
1184 return gid;
Mel Gormane29cf082013-10-07 11:29:22 +01001185}
1186
Iulia Manda44dba3d2014-10-31 02:13:31 +02001187/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01001188 * The averaged statistics, shared & private, memory & CPU,
Iulia Manda44dba3d2014-10-31 02:13:31 +02001189 * occupy the first half of the array. The second half of the
1190 * array is for current counters, which are averaged into the
1191 * first set by task_numa_placement.
1192 */
1193static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
Mel Gormanac8e8952013-10-07 11:29:03 +01001194{
Iulia Manda44dba3d2014-10-31 02:13:31 +02001195 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
Mel Gormanac8e8952013-10-07 11:29:03 +01001196}
1197
1198static inline unsigned long task_faults(struct task_struct *p, int nid)
1199{
Iulia Manda44dba3d2014-10-31 02:13:31 +02001200 if (!p->numa_faults)
Mel Gormanac8e8952013-10-07 11:29:03 +01001201 return 0;
1202
Iulia Manda44dba3d2014-10-31 02:13:31 +02001203 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1204 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gormanac8e8952013-10-07 11:29:03 +01001205}
1206
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001207static inline unsigned long group_faults(struct task_struct *p, int nid)
1208{
Jann Horncb361d82019-07-16 17:20:47 +02001209 struct numa_group *ng = deref_task_numa_group(p);
1210
1211 if (!ng)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001212 return 0;
1213
Jann Horncb361d82019-07-16 17:20:47 +02001214 return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1215 ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001216}
1217
Rik van Riel20e07de2014-01-27 17:03:43 -05001218static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1219{
Bharata B Rao5b763a12021-10-04 16:27:04 +05301220 return group->faults[task_faults_idx(NUMA_CPU, nid, 0)] +
1221 group->faults[task_faults_idx(NUMA_CPU, nid, 1)];
Rik van Riel20e07de2014-01-27 17:03:43 -05001222}
1223
Rik van Rielb5dd77c2017-07-31 15:28:47 -04001224static inline unsigned long group_faults_priv(struct numa_group *ng)
1225{
1226 unsigned long faults = 0;
1227 int node;
1228
1229 for_each_online_node(node) {
1230 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
1231 }
1232
1233 return faults;
1234}
1235
1236static inline unsigned long group_faults_shared(struct numa_group *ng)
1237{
1238 unsigned long faults = 0;
1239 int node;
1240
1241 for_each_online_node(node) {
1242 faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
1243 }
1244
1245 return faults;
1246}
1247
Rik van Riel4142c3e2016-01-25 17:07:39 -05001248/*
1249 * A node triggering more than 1/3 as many NUMA faults as the maximum is
1250 * considered part of a numa group's pseudo-interleaving set. Migrations
1251 * between these nodes are slowed down, to allow things to settle down.
1252 */
1253#define ACTIVE_NODE_FRACTION 3
1254
1255static bool numa_is_active_node(int nid, struct numa_group *ng)
1256{
1257 return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
1258}
1259
Rik van Riel6c6b1192014-10-17 03:29:52 -04001260/* Handle placement on systems where not all nodes are directly connected. */
1261static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1262 int maxdist, bool task)
1263{
1264 unsigned long score = 0;
1265 int node;
1266
1267 /*
1268 * All nodes are directly connected, and the same distance
1269 * from each other. No need for fancy placement algorithms.
1270 */
1271 if (sched_numa_topology_type == NUMA_DIRECT)
1272 return 0;
1273
1274 /*
1275 * This code is called for each node, introducing N^2 complexity,
1276 * which should be ok given the number of nodes rarely exceeds 8.
1277 */
1278 for_each_online_node(node) {
1279 unsigned long faults;
1280 int dist = node_distance(nid, node);
1281
1282 /*
1283 * The furthest away nodes in the system are not interesting
1284 * for placement; nid was already counted.
1285 */
1286 if (dist == sched_max_numa_distance || node == nid)
1287 continue;
1288
1289 /*
1290 * On systems with a backplane NUMA topology, compare groups
1291 * of nodes, and move tasks towards the group with the most
1292 * memory accesses. When comparing two nodes at distance
1293 * "hoplimit", only nodes closer by than "hoplimit" are part
1294 * of each group. Skip other nodes.
1295 */
1296 if (sched_numa_topology_type == NUMA_BACKPLANE &&
Srikar Dronamraju0ee7e742018-06-20 22:32:48 +05301297 dist >= maxdist)
Rik van Riel6c6b1192014-10-17 03:29:52 -04001298 continue;
1299
1300 /* Add up the faults from nearby nodes. */
1301 if (task)
1302 faults = task_faults(p, node);
1303 else
1304 faults = group_faults(p, node);
1305
1306 /*
1307 * On systems with a glueless mesh NUMA topology, there are
1308 * no fixed "groups of nodes". Instead, nodes that are not
1309 * directly connected bounce traffic through intermediate
1310 * nodes; a numa_group can occupy any set of nodes.
1311 * The further away a node is, the less the faults count.
1312 * This seems to result in good task placement.
1313 */
1314 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1315 faults *= (sched_max_numa_distance - dist);
1316 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1317 }
1318
1319 score += faults;
1320 }
1321
1322 return score;
1323}
1324
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001325/*
1326 * These return the fraction of accesses done by a particular task, or
1327 * task group, on a particular numa node. The group weight is given a
1328 * larger multiplier, in order to group tasks together that are almost
1329 * evenly spread out between numa nodes.
1330 */
Rik van Riel7bd95322014-10-17 03:29:51 -04001331static inline unsigned long task_weight(struct task_struct *p, int nid,
1332 int dist)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001333{
Rik van Riel7bd95322014-10-17 03:29:51 -04001334 unsigned long faults, total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001335
Iulia Manda44dba3d2014-10-31 02:13:31 +02001336 if (!p->numa_faults)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001337 return 0;
1338
1339 total_faults = p->total_numa_faults;
1340
1341 if (!total_faults)
1342 return 0;
1343
Rik van Riel7bd95322014-10-17 03:29:51 -04001344 faults = task_faults(p, nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001345 faults += score_nearby_nodes(p, nid, dist, true);
1346
Rik van Riel7bd95322014-10-17 03:29:51 -04001347 return 1000 * faults / total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001348}
1349
Rik van Riel7bd95322014-10-17 03:29:51 -04001350static inline unsigned long group_weight(struct task_struct *p, int nid,
1351 int dist)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001352{
Jann Horncb361d82019-07-16 17:20:47 +02001353 struct numa_group *ng = deref_task_numa_group(p);
Rik van Riel7bd95322014-10-17 03:29:51 -04001354 unsigned long faults, total_faults;
1355
Jann Horncb361d82019-07-16 17:20:47 +02001356 if (!ng)
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001357 return 0;
1358
Jann Horncb361d82019-07-16 17:20:47 +02001359 total_faults = ng->total_faults;
Rik van Riel7bd95322014-10-17 03:29:51 -04001360
1361 if (!total_faults)
1362 return 0;
1363
1364 faults = group_faults(p, nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001365 faults += score_nearby_nodes(p, nid, dist, false);
1366
Rik van Riel7bd95322014-10-17 03:29:51 -04001367 return 1000 * faults / total_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001368}
1369
Rik van Riel10f39042014-01-27 17:03:44 -05001370bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1371 int src_nid, int dst_cpu)
1372{
Jann Horncb361d82019-07-16 17:20:47 +02001373 struct numa_group *ng = deref_curr_numa_group(p);
Rik van Riel10f39042014-01-27 17:03:44 -05001374 int dst_nid = cpu_to_node(dst_cpu);
1375 int last_cpupid, this_cpupid;
1376
1377 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
Mel Gorman37355bd2018-10-01 11:05:25 +01001378 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1379
1380 /*
1381 * Allow first faults or private faults to migrate immediately early in
1382 * the lifetime of a task. The magic number 4 is based on waiting for
1383 * two full passes of the "multi-stage node selection" test that is
1384 * executed below.
1385 */
Anshuman Khandual98fa15f2019-03-05 15:42:58 -08001386 if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
Mel Gorman37355bd2018-10-01 11:05:25 +01001387 (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
1388 return true;
Rik van Riel10f39042014-01-27 17:03:44 -05001389
1390 /*
1391 * Multi-stage node selection is used in conjunction with a periodic
1392 * migration fault to build a temporal task<->page relation. By using
1393 * a two-stage filter we remove short/unlikely relations.
1394 *
1395 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1396 * a task's usage of a particular page (n_p) per total usage of this
1397 * page (n_t) (in a given time-span) to a probability.
1398 *
1399 * Our periodic faults will sample this probability and getting the
1400 * same result twice in a row, given these samples are fully
1401 * independent, is then given by P(n)^2, provided our sample period
1402 * is sufficiently short compared to the usage pattern.
1403 *
1404 * This quadric squishes small probabilities, making it less likely we
1405 * act on an unlikely task<->page relation.
1406 */
Rik van Riel10f39042014-01-27 17:03:44 -05001407 if (!cpupid_pid_unset(last_cpupid) &&
1408 cpupid_to_nid(last_cpupid) != dst_nid)
1409 return false;
1410
1411 /* Always allow migrate on private faults */
1412 if (cpupid_match_pid(p, last_cpupid))
1413 return true;
1414
1415 /* A shared fault, but p->numa_group has not been set up yet. */
1416 if (!ng)
1417 return true;
1418
1419 /*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001420 * Destination node is much more heavily used than the source
1421 * node? Allow migration.
Rik van Riel10f39042014-01-27 17:03:44 -05001422 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001423 if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
1424 ACTIVE_NODE_FRACTION)
Rik van Riel10f39042014-01-27 17:03:44 -05001425 return true;
1426
1427 /*
Rik van Riel4142c3e2016-01-25 17:07:39 -05001428 * Distribute memory according to CPU & memory use on each node,
1429 * with 3/4 hysteresis to avoid unnecessary memory migrations:
1430 *
1431 * faults_cpu(dst) 3 faults_cpu(src)
1432 * --------------- * - > ---------------
1433 * faults_mem(dst) 4 faults_mem(src)
Rik van Riel10f39042014-01-27 17:03:44 -05001434 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05001435 return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
1436 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
Rik van Riel10f39042014-01-27 17:03:44 -05001437}
1438
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001439/*
1440 * 'numa_type' describes the node at the moment of load balancing.
1441 */
1442enum numa_type {
1443 /* The node has spare capacity that can be used to run more tasks. */
1444 node_has_spare = 0,
1445 /*
1446 * The node is fully used and the tasks don't compete for more CPU
1447 * cycles. Nevertheless, some tasks might wait before running.
1448 */
1449 node_fully_busy,
1450 /*
1451 * The node is overloaded and can't provide expected CPU cycles to all
1452 * tasks.
1453 */
1454 node_overloaded
1455};
Mel Gormane6628d52013-10-07 11:29:02 +01001456
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001457/* Cached statistics for all CPUs within a node */
Mel Gorman58d081b2013-10-07 11:29:10 +01001458struct numa_stats {
1459 unsigned long load;
Vincent Guittot8e0e0ed2020-09-21 09:29:59 +02001460 unsigned long runnable;
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001461 unsigned long util;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001462 /* Total compute capacity of CPUs on a node */
Nicolas Pitre5ef20ca2014-05-26 18:19:34 -04001463 unsigned long compute_capacity;
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001464 unsigned int nr_running;
1465 unsigned int weight;
1466 enum numa_type node_type;
Mel Gormanff7db0b2020-02-24 09:52:20 +00001467 int idle_cpu;
Mel Gorman58d081b2013-10-07 11:29:10 +01001468};
Mel Gormane6628d52013-10-07 11:29:02 +01001469
Mel Gormanff7db0b2020-02-24 09:52:20 +00001470static inline bool is_core_idle(int cpu)
1471{
1472#ifdef CONFIG_SCHED_SMT
1473 int sibling;
1474
1475 for_each_cpu(sibling, cpu_smt_mask(cpu)) {
1476 if (cpu == sibling)
1477 continue;
1478
Mika Penttilä1c6829c2021-07-22 09:39:46 +03001479 if (!idle_cpu(sibling))
Mel Gormanff7db0b2020-02-24 09:52:20 +00001480 return false;
1481 }
1482#endif
1483
1484 return true;
1485}
1486
Mel Gorman58d081b2013-10-07 11:29:10 +01001487struct task_numa_env {
1488 struct task_struct *p;
1489
1490 int src_cpu, src_nid;
1491 int dst_cpu, dst_nid;
1492
1493 struct numa_stats src_stats, dst_stats;
1494
Wanpeng Li40ea2b42013-12-05 19:10:17 +08001495 int imbalance_pct;
Rik van Riel7bd95322014-10-17 03:29:51 -04001496 int dist;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001497
1498 struct task_struct *best_task;
1499 long best_imp;
Mel Gorman58d081b2013-10-07 11:29:10 +01001500 int best_cpu;
1501};
1502
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001503static unsigned long cpu_load(struct rq *rq);
Vincent Guittot8e0e0ed2020-09-21 09:29:59 +02001504static unsigned long cpu_runnable(struct rq *rq);
Mel Gorman7d2b5dd2020-11-20 09:06:29 +00001505static inline long adjust_numa_imbalance(int imbalance,
1506 int dst_running, int dst_weight);
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001507
1508static inline enum
1509numa_type numa_classify(unsigned int imbalance_pct,
1510 struct numa_stats *ns)
1511{
1512 if ((ns->nr_running > ns->weight) &&
Vincent Guittot8e0e0ed2020-09-21 09:29:59 +02001513 (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
1514 ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001515 return node_overloaded;
1516
1517 if ((ns->nr_running < ns->weight) ||
Vincent Guittot8e0e0ed2020-09-21 09:29:59 +02001518 (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
1519 ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001520 return node_has_spare;
1521
1522 return node_fully_busy;
1523}
1524
Valentin Schneider76c389a2020-03-03 11:02:57 +00001525#ifdef CONFIG_SCHED_SMT
1526/* Forward declarations of select_idle_sibling helpers */
1527static inline bool test_idle_cores(int cpu, bool def);
Mel Gormanff7db0b2020-02-24 09:52:20 +00001528static inline int numa_idle_core(int idle_core, int cpu)
1529{
Mel Gormanff7db0b2020-02-24 09:52:20 +00001530 if (!static_branch_likely(&sched_smt_present) ||
1531 idle_core >= 0 || !test_idle_cores(cpu, false))
1532 return idle_core;
1533
1534 /*
1535 * Prefer cores instead of packing HT siblings
1536 * and triggering future load balancing.
1537 */
1538 if (is_core_idle(cpu))
1539 idle_core = cpu;
Mel Gormanff7db0b2020-02-24 09:52:20 +00001540
1541 return idle_core;
1542}
Valentin Schneider76c389a2020-03-03 11:02:57 +00001543#else
1544static inline int numa_idle_core(int idle_core, int cpu)
1545{
1546 return idle_core;
1547}
1548#endif
Mel Gormanff7db0b2020-02-24 09:52:20 +00001549
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001550/*
Mel Gormanff7db0b2020-02-24 09:52:20 +00001551 * Gather all necessary information to make NUMA balancing placement
1552 * decisions that are compatible with standard load balancer. This
1553 * borrows code and logic from update_sg_lb_stats but sharing a
1554 * common implementation is impractical.
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001555 */
1556static void update_numa_stats(struct task_numa_env *env,
Mel Gormanff7db0b2020-02-24 09:52:20 +00001557 struct numa_stats *ns, int nid,
1558 bool find_idle)
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001559{
Mel Gormanff7db0b2020-02-24 09:52:20 +00001560 int cpu, idle_core = -1;
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001561
1562 memset(ns, 0, sizeof(*ns));
Mel Gormanff7db0b2020-02-24 09:52:20 +00001563 ns->idle_cpu = -1;
1564
Mel Gorman0621df32020-02-27 19:18:04 +00001565 rcu_read_lock();
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001566 for_each_cpu(cpu, cpumask_of_node(nid)) {
1567 struct rq *rq = cpu_rq(cpu);
1568
1569 ns->load += cpu_load(rq);
Vincent Guittot8e0e0ed2020-09-21 09:29:59 +02001570 ns->runnable += cpu_runnable(rq);
Dietmar Eggemann82762d22021-11-18 17:42:40 +01001571 ns->util += cpu_util_cfs(cpu);
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001572 ns->nr_running += rq->cfs.h_nr_running;
1573 ns->compute_capacity += capacity_of(cpu);
Mel Gormanff7db0b2020-02-24 09:52:20 +00001574
1575 if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
1576 if (READ_ONCE(rq->numa_migrate_on) ||
1577 !cpumask_test_cpu(cpu, env->p->cpus_ptr))
1578 continue;
1579
1580 if (ns->idle_cpu == -1)
1581 ns->idle_cpu = cpu;
1582
1583 idle_core = numa_idle_core(idle_core, cpu);
1584 }
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001585 }
Mel Gorman0621df32020-02-27 19:18:04 +00001586 rcu_read_unlock();
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001587
1588 ns->weight = cpumask_weight(cpumask_of_node(nid));
1589
1590 ns->node_type = numa_classify(env->imbalance_pct, ns);
Mel Gormanff7db0b2020-02-24 09:52:20 +00001591
1592 if (idle_core >= 0)
1593 ns->idle_cpu = idle_core;
Vincent Guittot6499b1b2020-02-24 09:52:15 +00001594}
1595
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001596static void task_numa_assign(struct task_numa_env *env,
1597 struct task_struct *p, long imp)
1598{
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301599 struct rq *rq = cpu_rq(env->dst_cpu);
1600
Mel Gorman5fb52dd2020-02-24 09:52:21 +00001601 /* Check if run-queue part of active NUMA balance. */
1602 if (env->best_cpu != env->dst_cpu && xchg(&rq->numa_migrate_on, 1)) {
1603 int cpu;
1604 int start = env->dst_cpu;
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301605
Mel Gorman5fb52dd2020-02-24 09:52:21 +00001606 /* Find alternative idle CPU. */
1607 for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
1608 if (cpu == env->best_cpu || !idle_cpu(cpu) ||
1609 !cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
1610 continue;
1611 }
1612
1613 env->dst_cpu = cpu;
1614 rq = cpu_rq(env->dst_cpu);
1615 if (!xchg(&rq->numa_migrate_on, 1))
1616 goto assign;
1617 }
1618
1619 /* Failed to find an alternative idle CPU */
1620 return;
1621 }
1622
1623assign:
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301624 /*
1625 * Clear previous best_cpu/rq numa-migrate flag, since task now
1626 * found a better CPU to move/swap.
1627 */
Mel Gorman5fb52dd2020-02-24 09:52:21 +00001628 if (env->best_cpu != -1 && env->best_cpu != env->dst_cpu) {
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301629 rq = cpu_rq(env->best_cpu);
1630 WRITE_ONCE(rq->numa_migrate_on, 0);
1631 }
1632
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001633 if (env->best_task)
1634 put_task_struct(env->best_task);
Oleg Nesterovbac78572016-05-18 21:57:33 +02001635 if (p)
1636 get_task_struct(p);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001637
1638 env->best_task = p;
1639 env->best_imp = imp;
1640 env->best_cpu = env->dst_cpu;
1641}
1642
Rik van Riel28a21742014-06-23 11:46:13 -04001643static bool load_too_imbalanced(long src_load, long dst_load,
Rik van Riele63da032014-05-14 13:22:21 -04001644 struct task_numa_env *env)
1645{
Rik van Riele4991b22015-05-27 15:04:27 -04001646 long imb, old_imb;
1647 long orig_src_load, orig_dst_load;
Rik van Riel28a21742014-06-23 11:46:13 -04001648 long src_capacity, dst_capacity;
1649
1650 /*
1651 * The load is corrected for the CPU capacity available on each node.
1652 *
1653 * src_load dst_load
1654 * ------------ vs ---------
1655 * src_capacity dst_capacity
1656 */
1657 src_capacity = env->src_stats.compute_capacity;
1658 dst_capacity = env->dst_stats.compute_capacity;
Rik van Riele63da032014-05-14 13:22:21 -04001659
Srikar Dronamraju5f95ba72018-06-20 22:32:44 +05301660 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
Rik van Riele63da032014-05-14 13:22:21 -04001661
Rik van Riel28a21742014-06-23 11:46:13 -04001662 orig_src_load = env->src_stats.load;
Rik van Riele4991b22015-05-27 15:04:27 -04001663 orig_dst_load = env->dst_stats.load;
Rik van Riel28a21742014-06-23 11:46:13 -04001664
Srikar Dronamraju5f95ba72018-06-20 22:32:44 +05301665 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
Rik van Riele4991b22015-05-27 15:04:27 -04001666
1667 /* Would this change make things worse? */
1668 return (imb > old_imb);
Rik van Riele63da032014-05-14 13:22:21 -04001669}
1670
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001671/*
Srikar Dronamraju6fd98e72018-09-21 23:19:01 +05301672 * Maximum NUMA importance can be 1998 (2*999);
1673 * SMALLIMP @ 30 would be close to 1998/64.
1674 * Used to deter task migration.
1675 */
1676#define SMALLIMP 30
1677
1678/*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001679 * This checks if the overall compute and NUMA accesses of the system would
1680 * be improved if the source tasks was migrated to the target dst_cpu taking
1681 * into account that it might be best if task running on the dst_cpu should
1682 * be exchanged with the source task
1683 */
Mel Gormana0f03b62020-02-24 09:52:23 +00001684static bool task_numa_compare(struct task_numa_env *env,
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301685 long taskimp, long groupimp, bool maymove)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001686{
Jann Horncb361d82019-07-16 17:20:47 +02001687 struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001688 struct rq *dst_rq = cpu_rq(env->dst_cpu);
Jann Horncb361d82019-07-16 17:20:47 +02001689 long imp = p_ng ? groupimp : taskimp;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001690 struct task_struct *cur;
Rik van Riel28a21742014-06-23 11:46:13 -04001691 long src_load, dst_load;
Rik van Riel7bd95322014-10-17 03:29:51 -04001692 int dist = env->dist;
Jann Horncb361d82019-07-16 17:20:47 +02001693 long moveimp = imp;
1694 long load;
Mel Gormana0f03b62020-02-24 09:52:23 +00001695 bool stopsearch = false;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001696
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301697 if (READ_ONCE(dst_rq->numa_migrate_on))
Mel Gormana0f03b62020-02-24 09:52:23 +00001698 return false;
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05301699
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001700 rcu_read_lock();
Eric W. Biederman154abaf2019-09-14 07:34:30 -05001701 cur = rcu_dereference(dst_rq->curr);
Oleg Nesterovbac78572016-05-18 21:57:33 +02001702 if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001703 cur = NULL;
1704
1705 /*
Peter Zijlstra7af68332014-11-10 10:54:35 +01001706 * Because we have preemption enabled we can get migrated around and
1707 * end try selecting ourselves (current == env->p) as a swap candidate.
1708 */
Mel Gormana0f03b62020-02-24 09:52:23 +00001709 if (cur == env->p) {
1710 stopsearch = true;
Peter Zijlstra7af68332014-11-10 10:54:35 +01001711 goto unlock;
Mel Gormana0f03b62020-02-24 09:52:23 +00001712 }
Peter Zijlstra7af68332014-11-10 10:54:35 +01001713
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301714 if (!cur) {
Srikar Dronamraju6fd98e72018-09-21 23:19:01 +05301715 if (maymove && moveimp >= env->best_imp)
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301716 goto assign;
1717 else
1718 goto unlock;
1719 }
1720
Mel Gorman88cca722020-02-24 09:52:22 +00001721 /* Skip this swap candidate if cannot move to the source cpu. */
1722 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1723 goto unlock;
1724
1725 /*
1726 * Skip this swap candidate if it is not moving to its preferred
1727 * node and the best task is.
1728 */
1729 if (env->best_task &&
1730 env->best_task->numa_preferred_nid == env->src_nid &&
1731 cur->numa_preferred_nid != env->src_nid) {
1732 goto unlock;
1733 }
1734
Peter Zijlstra7af68332014-11-10 10:54:35 +01001735 /*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001736 * "imp" is the fault differential for the source task between the
1737 * source and destination node. Calculate the total differential for
1738 * the source task and potential destination task. The more negative
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301739 * the value is, the more remote accesses that would be expected to
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001740 * be incurred if the tasks were swapped.
Mel Gorman88cca722020-02-24 09:52:22 +00001741 *
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301742 * If dst and source tasks are in the same NUMA group, or not
1743 * in any group then look only at task weights.
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001744 */
Jann Horncb361d82019-07-16 17:20:47 +02001745 cur_ng = rcu_dereference(cur->numa_group);
1746 if (cur_ng == p_ng) {
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301747 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1748 task_weight(cur, env->dst_nid, dist);
Rik van Riel0132c3e2014-06-23 11:46:16 -04001749 /*
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301750 * Add some hysteresis to prevent swapping the
1751 * tasks within a group over tiny differences.
Rik van Riel0132c3e2014-06-23 11:46:16 -04001752 */
Jann Horncb361d82019-07-16 17:20:47 +02001753 if (cur_ng)
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301754 imp -= imp / 16;
1755 } else {
1756 /*
1757 * Compare the group weights. If a task is all by itself
1758 * (not part of a group), use the task weight instead.
1759 */
Jann Horncb361d82019-07-16 17:20:47 +02001760 if (cur_ng && p_ng)
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301761 imp += group_weight(cur, env->src_nid, dist) -
1762 group_weight(cur, env->dst_nid, dist);
1763 else
1764 imp += task_weight(cur, env->src_nid, dist) -
1765 task_weight(cur, env->dst_nid, dist);
Rik van Riel0132c3e2014-06-23 11:46:16 -04001766 }
1767
Mel Gorman88cca722020-02-24 09:52:22 +00001768 /* Discourage picking a task already on its preferred node */
1769 if (cur->numa_preferred_nid == env->dst_nid)
1770 imp -= imp / 16;
1771
1772 /*
1773 * Encourage picking a task that moves to its preferred node.
1774 * This potentially makes imp larger than it's maximum of
1775 * 1998 (see SMALLIMP and task_weight for why) but in this
1776 * case, it does not matter.
1777 */
1778 if (cur->numa_preferred_nid == env->src_nid)
1779 imp += imp / 8;
1780
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301781 if (maymove && moveimp > imp && moveimp > env->best_imp) {
Srikar Dronamraju6fd98e72018-09-21 23:19:01 +05301782 imp = moveimp;
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301783 cur = NULL;
1784 goto assign;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001785 }
1786
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301787 /*
Mel Gorman88cca722020-02-24 09:52:22 +00001788 * Prefer swapping with a task moving to its preferred node over a
1789 * task that is not.
1790 */
1791 if (env->best_task && cur->numa_preferred_nid == env->src_nid &&
1792 env->best_task->numa_preferred_nid != env->src_nid) {
1793 goto assign;
1794 }
1795
1796 /*
Srikar Dronamraju6fd98e72018-09-21 23:19:01 +05301797 * If the NUMA importance is less than SMALLIMP,
1798 * task migration might only result in ping pong
1799 * of tasks and also hurt performance due to cache
1800 * misses.
1801 */
1802 if (imp < SMALLIMP || imp <= env->best_imp + SMALLIMP / 2)
1803 goto unlock;
1804
1805 /*
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301806 * In the overloaded case, try and keep the load balanced.
1807 */
1808 load = task_h_load(env->p) - task_h_load(cur);
1809 if (!load)
1810 goto assign;
1811
1812 dst_load = env->dst_stats.load + load;
1813 src_load = env->src_stats.load - load;
1814
Rik van Riel28a21742014-06-23 11:46:13 -04001815 if (load_too_imbalanced(src_load, dst_load, env))
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001816 goto unlock;
1817
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301818assign:
Mel Gormanff7db0b2020-02-24 09:52:20 +00001819 /* Evaluate an idle CPU for a task numa move. */
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001820 if (!cur) {
Mel Gormanff7db0b2020-02-24 09:52:20 +00001821 int cpu = env->dst_stats.idle_cpu;
1822
1823 /* Nothing cached so current CPU went idle since the search. */
1824 if (cpu < 0)
1825 cpu = env->dst_cpu;
1826
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001827 /*
Mel Gormanff7db0b2020-02-24 09:52:20 +00001828 * If the CPU is no longer truly idle and the previous best CPU
1829 * is, keep using it.
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001830 */
Mel Gormanff7db0b2020-02-24 09:52:20 +00001831 if (!idle_cpu(cpu) && env->best_cpu >= 0 &&
1832 idle_cpu(env->best_cpu)) {
1833 cpu = env->best_cpu;
1834 }
1835
Mel Gormanff7db0b2020-02-24 09:52:20 +00001836 env->dst_cpu = cpu;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02001837 }
Rik van Rielba7e5a22014-09-04 16:35:30 -04001838
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001839 task_numa_assign(env, cur, imp);
Mel Gormana0f03b62020-02-24 09:52:23 +00001840
1841 /*
1842 * If a move to idle is allowed because there is capacity or load
1843 * balance improves then stop the search. While a better swap
1844 * candidate may exist, a search is not free.
1845 */
1846 if (maymove && !cur && env->best_cpu >= 0 && idle_cpu(env->best_cpu))
1847 stopsearch = true;
1848
1849 /*
1850 * If a swap candidate must be identified and the current best task
1851 * moves its preferred node then stop the search.
1852 */
1853 if (!maymove && env->best_task &&
1854 env->best_task->numa_preferred_nid == env->src_nid) {
1855 stopsearch = true;
1856 }
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001857unlock:
1858 rcu_read_unlock();
Mel Gormana0f03b62020-02-24 09:52:23 +00001859
1860 return stopsearch;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001861}
1862
Rik van Riel887c2902013-10-07 11:29:31 +01001863static void task_numa_find_cpu(struct task_numa_env *env,
1864 long taskimp, long groupimp)
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001865{
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301866 bool maymove = false;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001867 int cpu;
1868
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301869 /*
Mel Gormanfb86f5b2020-02-24 09:52:16 +00001870 * If dst node has spare capacity, then check if there is an
1871 * imbalance that would be overruled by the load balancer.
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301872 */
Mel Gormanfb86f5b2020-02-24 09:52:16 +00001873 if (env->dst_stats.node_type == node_has_spare) {
1874 unsigned int imbalance;
1875 int src_running, dst_running;
1876
1877 /*
1878 * Would movement cause an imbalance? Note that if src has
1879 * more running tasks that the imbalance is ignored as the
1880 * move improves the imbalance from the perspective of the
1881 * CPU load balancer.
1882 * */
1883 src_running = env->src_stats.nr_running - 1;
1884 dst_running = env->dst_stats.nr_running + 1;
1885 imbalance = max(0, dst_running - src_running);
Mel Gorman7d2b5dd2020-11-20 09:06:29 +00001886 imbalance = adjust_numa_imbalance(imbalance, dst_running,
1887 env->dst_stats.weight);
Mel Gormanfb86f5b2020-02-24 09:52:16 +00001888
1889 /* Use idle CPU if there is no imbalance */
Mel Gormanff7db0b2020-02-24 09:52:20 +00001890 if (!imbalance) {
Mel Gormanfb86f5b2020-02-24 09:52:16 +00001891 maymove = true;
Mel Gormanff7db0b2020-02-24 09:52:20 +00001892 if (env->dst_stats.idle_cpu >= 0) {
1893 env->dst_cpu = env->dst_stats.idle_cpu;
1894 task_numa_assign(env, NULL, 0);
1895 return;
1896 }
1897 }
Mel Gormanfb86f5b2020-02-24 09:52:16 +00001898 } else {
1899 long src_load, dst_load, load;
1900 /*
1901 * If the improvement from just moving env->p direction is better
1902 * than swapping tasks around, check if a move is possible.
1903 */
1904 load = task_h_load(env->p);
1905 dst_load = env->dst_stats.load + load;
1906 src_load = env->src_stats.load - load;
1907 maymove = !load_too_imbalanced(src_load, dst_load, env);
1908 }
Srikar Dronamraju305c1fa2018-06-20 22:32:43 +05301909
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001910 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1911 /* Skip this CPU if the source task cannot migrate */
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02001912 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001913 continue;
1914
1915 env->dst_cpu = cpu;
Mel Gormana0f03b62020-02-24 09:52:23 +00001916 if (task_numa_compare(env, taskimp, groupimp, maymove))
1917 break;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001918 }
1919}
1920
Mel Gorman58d081b2013-10-07 11:29:10 +01001921static int task_numa_migrate(struct task_struct *p)
Mel Gormane6628d52013-10-07 11:29:02 +01001922{
Mel Gorman58d081b2013-10-07 11:29:10 +01001923 struct task_numa_env env = {
1924 .p = p,
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001925
Mel Gorman58d081b2013-10-07 11:29:10 +01001926 .src_cpu = task_cpu(p),
Ingo Molnarb32e86b2013-10-07 11:29:30 +01001927 .src_nid = task_node(p),
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001928
1929 .imbalance_pct = 112,
1930
1931 .best_task = NULL,
1932 .best_imp = 0,
Rik van Riel4142c3e2016-01-25 17:07:39 -05001933 .best_cpu = -1,
Mel Gorman58d081b2013-10-07 11:29:10 +01001934 };
Rik van Riel887c2902013-10-07 11:29:31 +01001935 unsigned long taskweight, groupweight;
Jann Horncb361d82019-07-16 17:20:47 +02001936 struct sched_domain *sd;
Rik van Riel887c2902013-10-07 11:29:31 +01001937 long taskimp, groupimp;
Jann Horncb361d82019-07-16 17:20:47 +02001938 struct numa_group *ng;
1939 struct rq *best_rq;
1940 int nid, ret, dist;
Mel Gormane6628d52013-10-07 11:29:02 +01001941
Mel Gorman58d081b2013-10-07 11:29:10 +01001942 /*
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001943 * Pick the lowest SD_NUMA domain, as that would have the smallest
1944 * imbalance and would be the first to start moving tasks about.
1945 *
1946 * And we want to avoid any moving of tasks about, as that would create
1947 * random movement of tasks -- counter the numa conditions we're trying
1948 * to satisfy here.
Mel Gorman58d081b2013-10-07 11:29:10 +01001949 */
Mel Gormane6628d52013-10-07 11:29:02 +01001950 rcu_read_lock();
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001951 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
Rik van Riel46a73e82013-11-11 19:29:25 -05001952 if (sd)
1953 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
Mel Gormane6628d52013-10-07 11:29:02 +01001954 rcu_read_unlock();
1955
Rik van Riel46a73e82013-11-11 19:29:25 -05001956 /*
1957 * Cpusets can break the scheduler domain tree into smaller
1958 * balance domains, some of which do not cross NUMA boundaries.
1959 * Tasks that are "trapped" in such domains cannot be migrated
1960 * elsewhere, so there is no point in (re)trying.
1961 */
1962 if (unlikely(!sd)) {
Srikar Dronamraju8cd45ee2018-06-20 22:32:45 +05301963 sched_setnuma(p, task_node(p));
Rik van Riel46a73e82013-11-11 19:29:25 -05001964 return -EINVAL;
1965 }
1966
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001967 env.dst_nid = p->numa_preferred_nid;
Rik van Riel7bd95322014-10-17 03:29:51 -04001968 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1969 taskweight = task_weight(p, env.src_nid, dist);
1970 groupweight = group_weight(p, env.src_nid, dist);
Mel Gormanff7db0b2020-02-24 09:52:20 +00001971 update_numa_stats(&env, &env.src_stats, env.src_nid, false);
Rik van Riel7bd95322014-10-17 03:29:51 -04001972 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1973 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
Mel Gormanff7db0b2020-02-24 09:52:20 +00001974 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
Mel Gorman58d081b2013-10-07 11:29:10 +01001975
Rik van Riela43455a2014-06-04 16:09:42 -04001976 /* Try to find a spot on the preferred nid. */
Srikar Dronamraju2d4056f2018-06-20 22:32:53 +05301977 task_numa_find_cpu(&env, taskimp, groupimp);
Rik van Riele1dda8a2013-10-07 11:29:19 +01001978
Rik van Riel9de05d42014-10-09 17:27:47 -04001979 /*
1980 * Look at other nodes in these cases:
1981 * - there is no space available on the preferred_nid
1982 * - the task is part of a numa_group that is interleaved across
1983 * multiple NUMA nodes; in order to better consolidate the group,
1984 * we need to check other locations.
1985 */
Jann Horncb361d82019-07-16 17:20:47 +02001986 ng = deref_curr_numa_group(p);
1987 if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
Mel Gorman2c8a50a2013-10-07 11:29:18 +01001988 for_each_online_node(nid) {
1989 if (nid == env.src_nid || nid == p->numa_preferred_nid)
Mel Gormanfb13c7e2013-10-07 11:29:17 +01001990 continue;
1991
Rik van Riel7bd95322014-10-17 03:29:51 -04001992 dist = node_distance(env.src_nid, env.dst_nid);
Rik van Riel6c6b1192014-10-17 03:29:52 -04001993 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1994 dist != env.dist) {
1995 taskweight = task_weight(p, env.src_nid, dist);
1996 groupweight = group_weight(p, env.src_nid, dist);
1997 }
Rik van Riel7bd95322014-10-17 03:29:51 -04001998
Mel Gorman83e1d2c2013-10-07 11:29:27 +01001999 /* Only consider nodes where both task and groups benefit */
Rik van Riel7bd95322014-10-17 03:29:51 -04002000 taskimp = task_weight(p, nid, dist) - taskweight;
2001 groupimp = group_weight(p, nid, dist) - groupweight;
Rik van Riel887c2902013-10-07 11:29:31 +01002002 if (taskimp < 0 && groupimp < 0)
Mel Gorman2c8a50a2013-10-07 11:29:18 +01002003 continue;
2004
Rik van Riel7bd95322014-10-17 03:29:51 -04002005 env.dist = dist;
Mel Gorman2c8a50a2013-10-07 11:29:18 +01002006 env.dst_nid = nid;
Mel Gormanff7db0b2020-02-24 09:52:20 +00002007 update_numa_stats(&env, &env.dst_stats, env.dst_nid, true);
Srikar Dronamraju2d4056f2018-06-20 22:32:53 +05302008 task_numa_find_cpu(&env, taskimp, groupimp);
Mel Gorman58d081b2013-10-07 11:29:10 +01002009 }
2010 }
2011
Rik van Riel68d1b022014-04-11 13:00:29 -04002012 /*
2013 * If the task is part of a workload that spans multiple NUMA nodes,
2014 * and is migrating into one of the workload's active nodes, remember
2015 * this node as the task's preferred numa node, so the workload can
2016 * settle down.
2017 * A task that migrated to a second choice node will be better off
2018 * trying for a better one later. Do not set the preferred node here.
2019 */
Jann Horncb361d82019-07-16 17:20:47 +02002020 if (ng) {
Rik van Rieldb015da2014-06-23 11:41:34 -04002021 if (env.best_cpu == -1)
2022 nid = env.src_nid;
2023 else
Srikar Dronamraju8cd45ee2018-06-20 22:32:45 +05302024 nid = cpu_to_node(env.best_cpu);
Rik van Rieldb015da2014-06-23 11:41:34 -04002025
Srikar Dronamraju8cd45ee2018-06-20 22:32:45 +05302026 if (nid != p->numa_preferred_nid)
2027 sched_setnuma(p, nid);
Rik van Rieldb015da2014-06-23 11:41:34 -04002028 }
2029
2030 /* No better CPU than the current one was found. */
Mel Gormanf22aef42020-02-24 09:52:12 +00002031 if (env.best_cpu == -1) {
Mel Gormanb2b20422020-02-24 09:52:13 +00002032 trace_sched_stick_numa(p, env.src_cpu, NULL, -1);
Rik van Rieldb015da2014-06-23 11:41:34 -04002033 return -EAGAIN;
Mel Gormanf22aef42020-02-24 09:52:12 +00002034 }
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002035
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05302036 best_rq = cpu_rq(env.best_cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01002037 if (env.best_task == NULL) {
Mel Gorman286549d2014-01-21 15:51:03 -08002038 ret = migrate_task_to(p, env.best_cpu);
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05302039 WRITE_ONCE(best_rq->numa_migrate_on, 0);
Mel Gorman286549d2014-01-21 15:51:03 -08002040 if (ret != 0)
Mel Gormanb2b20422020-02-24 09:52:13 +00002041 trace_sched_stick_numa(p, env.src_cpu, NULL, env.best_cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01002042 return ret;
2043 }
2044
Srikar Dronamraju0ad4e3d2018-06-20 22:32:50 +05302045 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
Srikar Dronamrajua4739ec2018-09-21 23:18:56 +05302046 WRITE_ONCE(best_rq->numa_migrate_on, 0);
Srikar Dronamraju0ad4e3d2018-06-20 22:32:50 +05302047
Mel Gorman286549d2014-01-21 15:51:03 -08002048 if (ret != 0)
Mel Gormanb2b20422020-02-24 09:52:13 +00002049 trace_sched_stick_numa(p, env.src_cpu, env.best_task, env.best_cpu);
Mel Gormanfb13c7e2013-10-07 11:29:17 +01002050 put_task_struct(env.best_task);
2051 return ret;
Mel Gormane6628d52013-10-07 11:29:02 +01002052}
2053
Mel Gorman6b9a7462013-10-07 11:29:11 +01002054/* Attempt to migrate a task to a CPU on the preferred node. */
2055static void numa_migrate_preferred(struct task_struct *p)
2056{
Rik van Riel5085e2a2014-04-11 13:00:28 -04002057 unsigned long interval = HZ;
2058
Rik van Riel2739d3e2013-10-07 11:29:41 +01002059 /* This task has no NUMA fault statistics yet */
Anshuman Khandual98fa15f2019-03-05 15:42:58 -08002060 if (unlikely(p->numa_preferred_nid == NUMA_NO_NODE || !p->numa_faults))
Rik van Riel2739d3e2013-10-07 11:29:41 +01002061 return;
2062
2063 /* Periodically retry migrating the task to the preferred node */
Rik van Riel5085e2a2014-04-11 13:00:28 -04002064 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
Mel Gorman789ba282018-05-09 17:31:15 +01002065 p->numa_migrate_retry = jiffies + interval;
Rik van Riel2739d3e2013-10-07 11:29:41 +01002066
Mel Gorman6b9a7462013-10-07 11:29:11 +01002067 /* Success if task is already running on preferred CPU */
Wanpeng Lide1b3012013-12-12 15:23:24 +08002068 if (task_node(p) == p->numa_preferred_nid)
Mel Gorman6b9a7462013-10-07 11:29:11 +01002069 return;
2070
Mel Gorman6b9a7462013-10-07 11:29:11 +01002071 /* Otherwise, try migrate to a CPU on the preferred node */
Rik van Riel2739d3e2013-10-07 11:29:41 +01002072 task_numa_migrate(p);
Mel Gorman6b9a7462013-10-07 11:29:11 +01002073}
2074
Rik van Riel04bb2f92013-10-07 11:29:36 +01002075/*
Bharata B Rao7d380f22021-10-04 16:27:05 +05302076 * Find out how many nodes the workload is actively running on. Do this by
Rik van Riel20e07de2014-01-27 17:03:43 -05002077 * tracking the nodes from which NUMA hinting faults are triggered. This can
2078 * be different from the set of nodes where the workload's memory is currently
2079 * located.
Rik van Riel20e07de2014-01-27 17:03:43 -05002080 */
Rik van Riel4142c3e2016-01-25 17:07:39 -05002081static void numa_group_count_active_nodes(struct numa_group *numa_group)
Rik van Riel20e07de2014-01-27 17:03:43 -05002082{
2083 unsigned long faults, max_faults = 0;
Rik van Riel4142c3e2016-01-25 17:07:39 -05002084 int nid, active_nodes = 0;
Rik van Riel20e07de2014-01-27 17:03:43 -05002085
2086 for_each_online_node(nid) {
2087 faults = group_faults_cpu(numa_group, nid);
2088 if (faults > max_faults)
2089 max_faults = faults;
2090 }
2091
2092 for_each_online_node(nid) {
2093 faults = group_faults_cpu(numa_group, nid);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002094 if (faults * ACTIVE_NODE_FRACTION > max_faults)
2095 active_nodes++;
Rik van Riel20e07de2014-01-27 17:03:43 -05002096 }
Rik van Riel4142c3e2016-01-25 17:07:39 -05002097
2098 numa_group->max_faults_cpu = max_faults;
2099 numa_group->active_nodes = active_nodes;
Rik van Riel20e07de2014-01-27 17:03:43 -05002100}
2101
2102/*
Rik van Riel04bb2f92013-10-07 11:29:36 +01002103 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
2104 * increments. The more local the fault statistics are, the higher the scan
Rik van Riela22b4b02014-06-23 11:41:35 -04002105 * period will be for the next scan window. If local/(local+remote) ratio is
2106 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
2107 * the scan period will decrease. Aim for 70% local accesses.
Rik van Riel04bb2f92013-10-07 11:29:36 +01002108 */
2109#define NUMA_PERIOD_SLOTS 10
Rik van Riela22b4b02014-06-23 11:41:35 -04002110#define NUMA_PERIOD_THRESHOLD 7
Rik van Riel04bb2f92013-10-07 11:29:36 +01002111
2112/*
2113 * Increase the scan period (slow down scanning) if the majority of
2114 * our memory is already on our local node, or if the majority of
2115 * the page accesses are shared with other processes.
2116 * Otherwise, decrease the scan period.
2117 */
2118static void update_task_scan_period(struct task_struct *p,
2119 unsigned long shared, unsigned long private)
2120{
2121 unsigned int period_slot;
Rik van Riel37ec97de2017-07-31 15:28:46 -04002122 int lr_ratio, ps_ratio;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002123 int diff;
2124
2125 unsigned long remote = p->numa_faults_locality[0];
2126 unsigned long local = p->numa_faults_locality[1];
2127
2128 /*
2129 * If there were no record hinting faults then either the task is
Bharata B Rao7d380f22021-10-04 16:27:05 +05302130 * completely idle or all activity is in areas that are not of interest
Mel Gorman074c2382015-03-25 15:55:42 -07002131 * to automatic numa balancing. Related to that, if there were failed
2132 * migration then it implies we are migrating too quickly or the local
2133 * node is overloaded. In either case, scan slower
Rik van Riel04bb2f92013-10-07 11:29:36 +01002134 */
Mel Gorman074c2382015-03-25 15:55:42 -07002135 if (local + shared == 0 || p->numa_faults_locality[2]) {
Rik van Riel04bb2f92013-10-07 11:29:36 +01002136 p->numa_scan_period = min(p->numa_scan_period_max,
2137 p->numa_scan_period << 1);
2138
2139 p->mm->numa_next_scan = jiffies +
2140 msecs_to_jiffies(p->numa_scan_period);
2141
2142 return;
2143 }
2144
2145 /*
2146 * Prepare to scale scan period relative to the current period.
2147 * == NUMA_PERIOD_THRESHOLD scan period stays the same
2148 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
2149 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
2150 */
2151 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
Rik van Riel37ec97de2017-07-31 15:28:46 -04002152 lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
2153 ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
2154
2155 if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
2156 /*
2157 * Most memory accesses are local. There is no need to
2158 * do fast NUMA scanning, since memory is already local.
2159 */
2160 int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
2161 if (!slot)
2162 slot = 1;
2163 diff = slot * period_slot;
2164 } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
2165 /*
2166 * Most memory accesses are shared with other tasks.
2167 * There is no point in continuing fast NUMA scanning,
2168 * since other tasks may just move the memory elsewhere.
2169 */
2170 int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002171 if (!slot)
2172 slot = 1;
2173 diff = slot * period_slot;
2174 } else {
Rik van Riel04bb2f92013-10-07 11:29:36 +01002175 /*
Rik van Riel37ec97de2017-07-31 15:28:46 -04002176 * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
2177 * yet they are not on the local NUMA node. Speed up
2178 * NUMA scanning to get the memory moved over.
Rik van Riel04bb2f92013-10-07 11:29:36 +01002179 */
Rik van Riel37ec97de2017-07-31 15:28:46 -04002180 int ratio = max(lr_ratio, ps_ratio);
2181 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002182 }
2183
2184 p->numa_scan_period = clamp(p->numa_scan_period + diff,
2185 task_scan_min(p), task_scan_max(p));
2186 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2187}
2188
Rik van Riel7e2703e2014-01-27 17:03:45 -05002189/*
2190 * Get the fraction of time the task has been running since the last
2191 * NUMA placement cycle. The scheduler keeps similar statistics, but
2192 * decays those on a 32ms period, which is orders of magnitude off
2193 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
2194 * stats only if the task is so new there are no NUMA statistics yet.
2195 */
2196static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
2197{
2198 u64 runtime, delta, now;
2199 /* Use the start of this time slice to avoid calculations. */
2200 now = p->se.exec_start;
2201 runtime = p->se.sum_exec_runtime;
2202
2203 if (p->last_task_numa_placement) {
2204 delta = runtime - p->last_sum_exec_runtime;
2205 *period = now - p->last_task_numa_placement;
Xie XiuQia860fa72019-04-20 16:34:16 +08002206
2207 /* Avoid time going backwards, prevent potential divide error: */
2208 if (unlikely((s64)*period < 0))
2209 *period = 0;
Rik van Riel7e2703e2014-01-27 17:03:45 -05002210 } else {
Peter Zijlstrac7b50212017-05-06 16:42:08 +02002211 delta = p->se.avg.load_sum;
Yuyang Du9d89c252015-07-15 08:04:37 +08002212 *period = LOAD_AVG_MAX;
Rik van Riel7e2703e2014-01-27 17:03:45 -05002213 }
2214
2215 p->last_sum_exec_runtime = runtime;
2216 p->last_task_numa_placement = now;
2217
2218 return delta;
2219}
2220
Rik van Riel54009412014-10-17 03:29:53 -04002221/*
2222 * Determine the preferred nid for a task in a numa_group. This needs to
2223 * be done in a way that produces consistent results with group_weight,
2224 * otherwise workloads might not converge.
2225 */
2226static int preferred_group_nid(struct task_struct *p, int nid)
2227{
2228 nodemask_t nodes;
2229 int dist;
2230
2231 /* Direct connections between all NUMA nodes. */
2232 if (sched_numa_topology_type == NUMA_DIRECT)
2233 return nid;
2234
2235 /*
2236 * On a system with glueless mesh NUMA topology, group_weight
2237 * scores nodes according to the number of NUMA hinting faults on
2238 * both the node itself, and on nearby nodes.
2239 */
2240 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
2241 unsigned long score, max_score = 0;
2242 int node, max_node = nid;
2243
2244 dist = sched_max_numa_distance;
2245
2246 for_each_online_node(node) {
2247 score = group_weight(p, node, dist);
2248 if (score > max_score) {
2249 max_score = score;
2250 max_node = node;
2251 }
2252 }
2253 return max_node;
2254 }
2255
2256 /*
2257 * Finding the preferred nid in a system with NUMA backplane
2258 * interconnect topology is more involved. The goal is to locate
2259 * tasks from numa_groups near each other in the system, and
2260 * untangle workloads from different sides of the system. This requires
2261 * searching down the hierarchy of node groups, recursively searching
2262 * inside the highest scoring group of nodes. The nodemask tricks
2263 * keep the complexity of the search down.
2264 */
2265 nodes = node_online_map;
2266 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
2267 unsigned long max_faults = 0;
Jan Beulich81907472015-01-23 08:25:38 +00002268 nodemask_t max_group = NODE_MASK_NONE;
Rik van Riel54009412014-10-17 03:29:53 -04002269 int a, b;
2270
2271 /* Are there nodes at this distance from each other? */
2272 if (!find_numa_distance(dist))
2273 continue;
2274
2275 for_each_node_mask(a, nodes) {
2276 unsigned long faults = 0;
2277 nodemask_t this_group;
2278 nodes_clear(this_group);
2279
2280 /* Sum group's NUMA faults; includes a==b case. */
2281 for_each_node_mask(b, nodes) {
2282 if (node_distance(a, b) < dist) {
2283 faults += group_faults(p, b);
2284 node_set(b, this_group);
2285 node_clear(b, nodes);
2286 }
2287 }
2288
2289 /* Remember the top group. */
2290 if (faults > max_faults) {
2291 max_faults = faults;
2292 max_group = this_group;
2293 /*
2294 * subtle: at the smallest distance there is
2295 * just one node left in each "group", the
2296 * winner is the preferred nid.
2297 */
2298 nid = a;
2299 }
2300 }
2301 /* Next round, evaluate the nodes within max_group. */
Jan Beulich890a5402015-02-09 12:30:00 +01002302 if (!max_faults)
2303 break;
Rik van Riel54009412014-10-17 03:29:53 -04002304 nodes = max_group;
2305 }
2306 return nid;
2307}
2308
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002309static void task_numa_placement(struct task_struct *p)
2310{
Anshuman Khandual98fa15f2019-03-05 15:42:58 -08002311 int seq, nid, max_nid = NUMA_NO_NODE;
Srikar Dronamrajuf03bb672018-06-20 22:32:46 +05302312 unsigned long max_faults = 0;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002313 unsigned long fault_types[2] = { 0, 0 };
Rik van Riel7e2703e2014-01-27 17:03:45 -05002314 unsigned long total_faults;
2315 u64 runtime, period;
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002316 spinlock_t *group_lock = NULL;
Jann Horncb361d82019-07-16 17:20:47 +02002317 struct numa_group *ng;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002318
Jason Low7e5a2c12015-04-30 17:28:14 -07002319 /*
2320 * The p->mm->numa_scan_seq field gets updated without
2321 * exclusive access. Use READ_ONCE() here to ensure
2322 * that the field is read in a single access:
2323 */
Jason Low316c1608d2015-04-28 13:00:20 -07002324 seq = READ_ONCE(p->mm->numa_scan_seq);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002325 if (p->numa_scan_seq == seq)
2326 return;
2327 p->numa_scan_seq = seq;
Mel Gorman598f0ec2013-10-07 11:28:55 +01002328 p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002329
Rik van Riel7e2703e2014-01-27 17:03:45 -05002330 total_faults = p->numa_faults_locality[0] +
2331 p->numa_faults_locality[1];
2332 runtime = numa_get_avg_runtime(p, &period);
2333
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002334 /* If the task is part of a group prevent parallel updates to group stats */
Jann Horncb361d82019-07-16 17:20:47 +02002335 ng = deref_curr_numa_group(p);
2336 if (ng) {
2337 group_lock = &ng->lock;
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002338 spin_lock_irq(group_lock);
Mel Gorman7dbd13e2013-10-07 11:29:29 +01002339 }
2340
Mel Gorman688b7582013-10-07 11:28:58 +01002341 /* Find the node with the highest number of faults */
2342 for_each_online_node(nid) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002343 /* Keep track of the offsets in numa_faults array */
2344 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002345 unsigned long faults = 0, group_faults = 0;
Iulia Manda44dba3d2014-10-31 02:13:31 +02002346 int priv;
Mel Gorman745d6142013-10-07 11:28:59 +01002347
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002348 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
Rik van Riel7e2703e2014-01-27 17:03:45 -05002349 long diff, f_diff, f_weight;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002350
Iulia Manda44dba3d2014-10-31 02:13:31 +02002351 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2352 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2353 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2354 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
Mel Gorman745d6142013-10-07 11:28:59 +01002355
Mel Gormanac8e8952013-10-07 11:29:03 +01002356 /* Decay existing window, copy faults since last scan */
Iulia Manda44dba3d2014-10-31 02:13:31 +02002357 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2358 fault_types[priv] += p->numa_faults[membuf_idx];
2359 p->numa_faults[membuf_idx] = 0;
Mel Gormanfb13c7e2013-10-07 11:29:17 +01002360
Rik van Riel7e2703e2014-01-27 17:03:45 -05002361 /*
2362 * Normalize the faults_from, so all tasks in a group
2363 * count according to CPU use, instead of by the raw
2364 * number of faults. Tasks with little runtime have
2365 * little over-all impact on throughput, and thus their
2366 * faults are less important.
2367 */
2368 f_weight = div64_u64(runtime << 16, period + 1);
Iulia Manda44dba3d2014-10-31 02:13:31 +02002369 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
Rik van Riel7e2703e2014-01-27 17:03:45 -05002370 (total_faults + 1);
Iulia Manda44dba3d2014-10-31 02:13:31 +02002371 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2372 p->numa_faults[cpubuf_idx] = 0;
Rik van Riel50ec8a42014-01-27 17:03:42 -05002373
Iulia Manda44dba3d2014-10-31 02:13:31 +02002374 p->numa_faults[mem_idx] += diff;
2375 p->numa_faults[cpu_idx] += f_diff;
2376 faults += p->numa_faults[mem_idx];
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002377 p->total_numa_faults += diff;
Jann Horncb361d82019-07-16 17:20:47 +02002378 if (ng) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002379 /*
2380 * safe because we can only change our own group
2381 *
2382 * mem_idx represents the offset for a given
2383 * nid and priv in a specific region because it
2384 * is at the beginning of the numa_faults array.
2385 */
Jann Horncb361d82019-07-16 17:20:47 +02002386 ng->faults[mem_idx] += diff;
Bharata B Rao5b763a12021-10-04 16:27:04 +05302387 ng->faults[cpu_idx] += f_diff;
Jann Horncb361d82019-07-16 17:20:47 +02002388 ng->total_faults += diff;
2389 group_faults += ng->faults[mem_idx];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002390 }
Mel Gormanac8e8952013-10-07 11:29:03 +01002391 }
2392
Jann Horncb361d82019-07-16 17:20:47 +02002393 if (!ng) {
Srikar Dronamrajuf03bb672018-06-20 22:32:46 +05302394 if (faults > max_faults) {
2395 max_faults = faults;
2396 max_nid = nid;
2397 }
2398 } else if (group_faults > max_faults) {
2399 max_faults = group_faults;
Mel Gorman688b7582013-10-07 11:28:58 +01002400 max_nid = nid;
2401 }
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002402 }
2403
Jann Horncb361d82019-07-16 17:20:47 +02002404 if (ng) {
2405 numa_group_count_active_nodes(ng);
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002406 spin_unlock_irq(group_lock);
Srikar Dronamrajuf03bb672018-06-20 22:32:46 +05302407 max_nid = preferred_group_nid(p, max_nid);
Mel Gorman688b7582013-10-07 11:28:58 +01002408 }
2409
Rik van Rielbb97fc32014-06-04 16:33:15 -04002410 if (max_faults) {
2411 /* Set the new preferred node */
2412 if (max_nid != p->numa_preferred_nid)
2413 sched_setnuma(p, max_nid);
Mel Gorman3a7053b2013-10-07 11:29:00 +01002414 }
Srikar Dronamraju30619c82018-06-20 22:32:55 +05302415
2416 update_task_scan_period(p, fault_types[0], fault_types[1]);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002417}
2418
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002419static inline int get_numa_group(struct numa_group *grp)
2420{
Elena Reshetovac45a7792019-01-18 14:27:28 +02002421 return refcount_inc_not_zero(&grp->refcount);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002422}
2423
2424static inline void put_numa_group(struct numa_group *grp)
2425{
Elena Reshetovac45a7792019-01-18 14:27:28 +02002426 if (refcount_dec_and_test(&grp->refcount))
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002427 kfree_rcu(grp, rcu);
2428}
2429
Mel Gorman3e6a9412013-10-07 11:29:35 +01002430static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2431 int *priv)
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002432{
2433 struct numa_group *grp, *my_grp;
2434 struct task_struct *tsk;
2435 bool join = false;
2436 int cpu = cpupid_to_cpu(cpupid);
2437 int i;
2438
Jann Horncb361d82019-07-16 17:20:47 +02002439 if (unlikely(!deref_curr_numa_group(p))) {
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002440 unsigned int size = sizeof(struct numa_group) +
Bharata B Rao7a2341fc2021-10-04 16:27:03 +05302441 NR_NUMA_HINT_FAULT_STATS *
2442 nr_node_ids * sizeof(unsigned long);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002443
2444 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2445 if (!grp)
2446 return;
2447
Elena Reshetovac45a7792019-01-18 14:27:28 +02002448 refcount_set(&grp->refcount, 1);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002449 grp->active_nodes = 1;
2450 grp->max_faults_cpu = 0;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002451 spin_lock_init(&grp->lock);
Mel Gormane29cf082013-10-07 11:29:22 +01002452 grp->gid = p->pid;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002453
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002454 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda44dba3d2014-10-31 02:13:31 +02002455 grp->faults[i] = p->numa_faults[i];
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002456
Mel Gorman989348b2013-10-07 11:29:40 +01002457 grp->total_faults = p->total_numa_faults;
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002458
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002459 grp->nr_tasks++;
2460 rcu_assign_pointer(p->numa_group, grp);
2461 }
2462
2463 rcu_read_lock();
Jason Low316c1608d2015-04-28 13:00:20 -07002464 tsk = READ_ONCE(cpu_rq(cpu)->curr);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002465
2466 if (!cpupid_match_pid(tsk, cpupid))
Peter Zijlstra33547812013-10-09 10:24:48 +02002467 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002468
2469 grp = rcu_dereference(tsk->numa_group);
2470 if (!grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002471 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002472
Jann Horncb361d82019-07-16 17:20:47 +02002473 my_grp = deref_curr_numa_group(p);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002474 if (grp == my_grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002475 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002476
2477 /*
2478 * Only join the other group if its bigger; if we're the bigger group,
2479 * the other task will join us.
2480 */
2481 if (my_grp->nr_tasks > grp->nr_tasks)
Peter Zijlstra33547812013-10-09 10:24:48 +02002482 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002483
2484 /*
2485 * Tie-break on the grp address.
2486 */
2487 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
Peter Zijlstra33547812013-10-09 10:24:48 +02002488 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002489
Rik van Rieldabe1d92013-10-07 11:29:34 +01002490 /* Always join threads in the same process. */
2491 if (tsk->mm == current->mm)
2492 join = true;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002493
Rik van Rieldabe1d92013-10-07 11:29:34 +01002494 /* Simple filter to avoid false positives due to PID collisions */
2495 if (flags & TNF_SHARED)
2496 join = true;
2497
Mel Gorman3e6a9412013-10-07 11:29:35 +01002498 /* Update priv based on whether false sharing was detected */
2499 *priv = !join;
2500
Rik van Rieldabe1d92013-10-07 11:29:34 +01002501 if (join && !get_numa_group(grp))
Peter Zijlstra33547812013-10-09 10:24:48 +02002502 goto no_join;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002503
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002504 rcu_read_unlock();
2505
2506 if (!join)
2507 return;
2508
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002509 BUG_ON(irqs_disabled());
2510 double_lock_irq(&my_grp->lock, &grp->lock);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002511
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002512 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
Iulia Manda44dba3d2014-10-31 02:13:31 +02002513 my_grp->faults[i] -= p->numa_faults[i];
2514 grp->faults[i] += p->numa_faults[i];
Mel Gorman989348b2013-10-07 11:29:40 +01002515 }
2516 my_grp->total_faults -= p->total_numa_faults;
2517 grp->total_faults += p->total_numa_faults;
2518
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002519 my_grp->nr_tasks--;
2520 grp->nr_tasks++;
2521
2522 spin_unlock(&my_grp->lock);
Mike Galbraith60e69ee2014-04-07 10:55:15 +02002523 spin_unlock_irq(&grp->lock);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002524
2525 rcu_assign_pointer(p->numa_group, grp);
2526
2527 put_numa_group(my_grp);
Peter Zijlstra33547812013-10-09 10:24:48 +02002528 return;
2529
2530no_join:
2531 rcu_read_unlock();
2532 return;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002533}
2534
Jann Horn16d51a52019-07-16 17:20:45 +02002535/*
Ingo Molnar3b037062021-03-18 13:38:50 +01002536 * Get rid of NUMA statistics associated with a task (either current or dead).
Jann Horn16d51a52019-07-16 17:20:45 +02002537 * If @final is set, the task is dead and has reached refcount zero, so we can
2538 * safely free all relevant data structures. Otherwise, there might be
2539 * concurrent reads from places like load balancing and procfs, and we should
2540 * reset the data back to default state without freeing ->numa_faults.
2541 */
2542void task_numa_free(struct task_struct *p, bool final)
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002543{
Jann Horncb361d82019-07-16 17:20:47 +02002544 /* safe: p either is current or is being freed by current */
2545 struct numa_group *grp = rcu_dereference_raw(p->numa_group);
Jann Horn16d51a52019-07-16 17:20:45 +02002546 unsigned long *numa_faults = p->numa_faults;
Steven Rostedte9dd6852014-05-27 17:02:04 -04002547 unsigned long flags;
2548 int i;
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002549
Jann Horn16d51a52019-07-16 17:20:45 +02002550 if (!numa_faults)
2551 return;
2552
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002553 if (grp) {
Steven Rostedte9dd6852014-05-27 17:02:04 -04002554 spin_lock_irqsave(&grp->lock, flags);
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002555 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda44dba3d2014-10-31 02:13:31 +02002556 grp->faults[i] -= p->numa_faults[i];
Mel Gorman989348b2013-10-07 11:29:40 +01002557 grp->total_faults -= p->total_numa_faults;
2558
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002559 grp->nr_tasks--;
Steven Rostedte9dd6852014-05-27 17:02:04 -04002560 spin_unlock_irqrestore(&grp->lock, flags);
Andreea-Cristina Bernat35b123e2014-08-22 17:50:43 +03002561 RCU_INIT_POINTER(p->numa_group, NULL);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002562 put_numa_group(grp);
2563 }
2564
Jann Horn16d51a52019-07-16 17:20:45 +02002565 if (final) {
2566 p->numa_faults = NULL;
2567 kfree(numa_faults);
2568 } else {
2569 p->total_numa_faults = 0;
2570 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2571 numa_faults[i] = 0;
2572 }
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002573}
2574
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002575/*
2576 * Got a PROT_NONE fault for a page on @node.
2577 */
Rik van Riel58b46da2014-01-27 17:03:47 -05002578void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002579{
2580 struct task_struct *p = current;
Peter Zijlstra6688cc02013-10-07 11:29:24 +01002581 bool migrated = flags & TNF_MIGRATED;
Rik van Riel58b46da2014-01-27 17:03:47 -05002582 int cpu_node = task_node(current);
Rik van Riel792568e2014-04-11 13:00:27 -04002583 int local = !!(flags & TNF_FAULT_LOCAL);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002584 struct numa_group *ng;
Mel Gormanac8e8952013-10-07 11:29:03 +01002585 int priv;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002586
Srikar Dronamraju2a595722015-08-11 21:54:21 +05302587 if (!static_branch_likely(&sched_numa_balancing))
Mel Gorman1a687c22012-11-22 11:16:36 +00002588 return;
2589
Mel Gorman9ff1d9f2013-10-07 11:29:04 +01002590 /* for example, ksmd faulting in a user's mm */
2591 if (!p->mm)
2592 return;
2593
Mel Gormanf809ca92013-10-07 11:28:57 +01002594 /* Allocate buffer to track faults on a per-node basis */
Iulia Manda44dba3d2014-10-31 02:13:31 +02002595 if (unlikely(!p->numa_faults)) {
2596 int size = sizeof(*p->numa_faults) *
Rik van Rielbe1e4e72014-01-27 17:03:48 -05002597 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
Mel Gormanf809ca92013-10-07 11:28:57 +01002598
Iulia Manda44dba3d2014-10-31 02:13:31 +02002599 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2600 if (!p->numa_faults)
Mel Gormanf809ca92013-10-07 11:28:57 +01002601 return;
Mel Gorman745d6142013-10-07 11:28:59 +01002602
Mel Gorman83e1d2c2013-10-07 11:29:27 +01002603 p->total_numa_faults = 0;
Rik van Riel04bb2f92013-10-07 11:29:36 +01002604 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
Mel Gormanf809ca92013-10-07 11:28:57 +01002605 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002606
Mel Gormanfb003b82012-11-15 09:01:14 +00002607 /*
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002608 * First accesses are treated as private, otherwise consider accesses
2609 * to be private if the accessing pid has not changed
2610 */
2611 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2612 priv = 1;
2613 } else {
2614 priv = cpupid_match_pid(p, last_cpupid);
Peter Zijlstra6688cc02013-10-07 11:29:24 +01002615 if (!priv && !(flags & TNF_NO_GROUP))
Mel Gorman3e6a9412013-10-07 11:29:35 +01002616 task_numa_group(p, last_cpupid, flags, &priv);
Peter Zijlstra8c8a7432013-10-07 11:29:21 +01002617 }
2618
Rik van Riel792568e2014-04-11 13:00:27 -04002619 /*
2620 * If a workload spans multiple NUMA nodes, a shared fault that
2621 * occurs wholly within the set of nodes that the workload is
2622 * actively using should be counted as local. This allows the
2623 * scan rate to slow down when a workload has settled down.
2624 */
Jann Horncb361d82019-07-16 17:20:47 +02002625 ng = deref_curr_numa_group(p);
Rik van Riel4142c3e2016-01-25 17:07:39 -05002626 if (!priv && !local && ng && ng->active_nodes > 1 &&
2627 numa_is_active_node(cpu_node, ng) &&
2628 numa_is_active_node(mem_node, ng))
Rik van Riel792568e2014-04-11 13:00:27 -04002629 local = 1;
2630
Rik van Riel2739d3e2013-10-07 11:29:41 +01002631 /*
Yi Wange1ff5162018-11-05 08:50:13 +08002632 * Retry to migrate task to preferred node periodically, in case it
2633 * previously failed, or the scheduler moved us.
Rik van Riel2739d3e2013-10-07 11:29:41 +01002634 */
Srikar Dronamrajub6a60cf2018-06-20 22:33:00 +05302635 if (time_after(jiffies, p->numa_migrate_retry)) {
2636 task_numa_placement(p);
Mel Gorman6b9a7462013-10-07 11:29:11 +01002637 numa_migrate_preferred(p);
Srikar Dronamrajub6a60cf2018-06-20 22:33:00 +05302638 }
Mel Gorman6b9a7462013-10-07 11:29:11 +01002639
Ingo Molnarb32e86b2013-10-07 11:29:30 +01002640 if (migrated)
2641 p->numa_pages_migrated += pages;
Mel Gorman074c2382015-03-25 15:55:42 -07002642 if (flags & TNF_MIGRATE_FAIL)
2643 p->numa_faults_locality[2] += pages;
Ingo Molnarb32e86b2013-10-07 11:29:30 +01002644
Iulia Manda44dba3d2014-10-31 02:13:31 +02002645 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2646 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
Rik van Riel792568e2014-04-11 13:00:27 -04002647 p->numa_faults_locality[local] += pages;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002648}
2649
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002650static void reset_ptenuma_scan(struct task_struct *p)
2651{
Jason Low7e5a2c12015-04-30 17:28:14 -07002652 /*
2653 * We only did a read acquisition of the mmap sem, so
2654 * p->mm->numa_scan_seq is written to without exclusive access
2655 * and the update is not guaranteed to be atomic. That's not
2656 * much of an issue though, since this is just used for
2657 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2658 * expensive, to avoid any form of compiler optimizations:
2659 */
Jason Low316c1608d2015-04-28 13:00:20 -07002660 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002661 p->mm->numa_scan_offset = 0;
2662}
2663
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002664/*
2665 * The expensive part of numa migration is done from task_work context.
2666 * Triggered from task_tick_numa().
2667 */
Valentin Schneider9434f9f2019-07-15 11:25:08 +01002668static void task_numa_work(struct callback_head *work)
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002669{
2670 unsigned long migrate, next_scan, now = jiffies;
2671 struct task_struct *p = current;
2672 struct mm_struct *mm = p->mm;
Rik van Riel51170842015-11-05 15:56:23 -05002673 u64 runtime = p->se.sum_exec_runtime;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002674 struct vm_area_struct *vma;
Mel Gorman9f406042012-11-14 18:34:32 +00002675 unsigned long start, end;
Mel Gorman598f0ec2013-10-07 11:28:55 +01002676 unsigned long nr_pte_updates = 0;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002677 long pages, virtpages;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002678
Peter Zijlstra9148a3a2016-09-20 22:34:51 +02002679 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002680
Valentin Schneiderb34920d2019-07-15 11:25:07 +01002681 work->next = work;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002682 /*
2683 * Who cares about NUMA placement when they're dying.
2684 *
2685 * NOTE: make sure not to dereference p->mm before this check,
2686 * exit_task_work() happens _after_ exit_mm() so we could be called
2687 * without p->mm even though we still had it when we enqueued this
2688 * work.
2689 */
2690 if (p->flags & PF_EXITING)
2691 return;
2692
Mel Gorman930aa172013-10-07 11:29:37 +01002693 if (!mm->numa_next_scan) {
Mel Gorman7e8d16b2013-10-07 11:28:54 +01002694 mm->numa_next_scan = now +
2695 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
Mel Gormanb8593bf2012-11-21 01:18:23 +00002696 }
2697
2698 /*
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002699 * Enforce maximal scan/migration frequency..
2700 */
2701 migrate = mm->numa_next_scan;
2702 if (time_before(now, migrate))
2703 return;
2704
Mel Gorman598f0ec2013-10-07 11:28:55 +01002705 if (p->numa_scan_period == 0) {
2706 p->numa_scan_period_max = task_scan_max(p);
Rik van Rielb5dd77c2017-07-31 15:28:47 -04002707 p->numa_scan_period = task_scan_start(p);
Mel Gorman598f0ec2013-10-07 11:28:55 +01002708 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002709
Mel Gormanfb003b82012-11-15 09:01:14 +00002710 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002711 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2712 return;
2713
Mel Gormane14808b2012-11-19 10:59:15 +00002714 /*
Peter Zijlstra19a78d12013-10-07 11:28:51 +01002715 * Delay this task enough that another task of this mm will likely win
2716 * the next time around.
2717 */
2718 p->node_stamp += 2 * TICK_NSEC;
2719
Mel Gorman9f406042012-11-14 18:34:32 +00002720 start = mm->numa_scan_offset;
2721 pages = sysctl_numa_balancing_scan_size;
2722 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
Rik van Riel4620f8c2015-09-11 09:00:27 -04002723 virtpages = pages * 8; /* Scan up to this much virtual space */
Mel Gorman9f406042012-11-14 18:34:32 +00002724 if (!pages)
2725 return;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002726
Rik van Riel4620f8c2015-09-11 09:00:27 -04002727
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002728 if (!mmap_read_trylock(mm))
Vlastimil Babka8655d542017-05-15 15:13:16 +02002729 return;
Mel Gorman9f406042012-11-14 18:34:32 +00002730 vma = find_vma(mm, start);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002731 if (!vma) {
2732 reset_ptenuma_scan(p);
Mel Gorman9f406042012-11-14 18:34:32 +00002733 start = 0;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002734 vma = mm->mmap;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002735 }
Mel Gorman9f406042012-11-14 18:34:32 +00002736 for (; vma; vma = vma->vm_next) {
Naoya Horiguchi6b79c572015-04-07 14:26:47 -07002737 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
Mel Gorman8e76d4e2015-06-10 11:15:00 -07002738 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002739 continue;
Naoya Horiguchi6b79c572015-04-07 14:26:47 -07002740 }
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002741
Mel Gorman4591ce4f2013-10-07 11:29:13 +01002742 /*
2743 * Shared library pages mapped by multiple processes are not
2744 * migrated as it is expected they are cache replicated. Avoid
2745 * hinting faults in read-only file-backed mappings or the vdso
2746 * as migrating the pages will be of marginal benefit.
2747 */
2748 if (!vma->vm_mm ||
2749 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2750 continue;
2751
Mel Gorman3c67f472013-12-18 17:08:40 -08002752 /*
2753 * Skip inaccessible VMAs to avoid any confusion between
2754 * PROT_NONE and NUMA hinting ptes
2755 */
Anshuman Khandual3122e802020-04-06 20:03:47 -07002756 if (!vma_is_accessible(vma))
Mel Gorman3c67f472013-12-18 17:08:40 -08002757 continue;
2758
Mel Gorman9f406042012-11-14 18:34:32 +00002759 do {
2760 start = max(start, vma->vm_start);
2761 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2762 end = min(end, vma->vm_end);
Rik van Riel4620f8c2015-09-11 09:00:27 -04002763 nr_pte_updates = change_prot_numa(vma, start, end);
Mel Gorman598f0ec2013-10-07 11:28:55 +01002764
2765 /*
Rik van Riel4620f8c2015-09-11 09:00:27 -04002766 * Try to scan sysctl_numa_balancing_size worth of
2767 * hpages that have at least one present PTE that
2768 * is not already pte-numa. If the VMA contains
2769 * areas that are unused or already full of prot_numa
2770 * PTEs, scan up to virtpages, to skip through those
2771 * areas faster.
Mel Gorman598f0ec2013-10-07 11:28:55 +01002772 */
2773 if (nr_pte_updates)
2774 pages -= (end - start) >> PAGE_SHIFT;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002775 virtpages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002776
Mel Gorman9f406042012-11-14 18:34:32 +00002777 start = end;
Rik van Riel4620f8c2015-09-11 09:00:27 -04002778 if (pages <= 0 || virtpages <= 0)
Mel Gorman9f406042012-11-14 18:34:32 +00002779 goto out;
Rik van Riel3cf19622014-02-18 17:12:44 -05002780
2781 cond_resched();
Mel Gorman9f406042012-11-14 18:34:32 +00002782 } while (end != vma->vm_end);
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002783 }
2784
Mel Gorman9f406042012-11-14 18:34:32 +00002785out:
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002786 /*
Peter Zijlstrac69307d2013-10-07 11:28:41 +01002787 * It is possible to reach the end of the VMA list but the last few
2788 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2789 * would find the !migratable VMA on the next scan but not reset the
2790 * scanner to the start so check it now.
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002791 */
2792 if (vma)
Mel Gorman9f406042012-11-14 18:34:32 +00002793 mm->numa_scan_offset = start;
Peter Zijlstra6e5fb222012-10-25 14:16:45 +02002794 else
2795 reset_ptenuma_scan(p);
Michel Lespinassed8ed45c2020-06-08 21:33:25 -07002796 mmap_read_unlock(mm);
Rik van Riel51170842015-11-05 15:56:23 -05002797
2798 /*
2799 * Make sure tasks use at least 32x as much time to run other code
2800 * than they used here, to limit NUMA PTE scanning overhead to 3% max.
2801 * Usually update_task_scan_period slows down scanning enough; on an
2802 * overloaded system we need to limit overhead on a per task basis.
2803 */
2804 if (unlikely(p->se.sum_exec_runtime != runtime)) {
2805 u64 diff = p->se.sum_exec_runtime - runtime;
2806 p->node_stamp += 32 * diff;
2807 }
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002808}
2809
Valentin Schneiderd35927a2019-07-15 11:25:06 +01002810void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2811{
2812 int mm_users = 0;
2813 struct mm_struct *mm = p->mm;
2814
2815 if (mm) {
2816 mm_users = atomic_read(&mm->mm_users);
2817 if (mm_users == 1) {
2818 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2819 mm->numa_scan_seq = 0;
2820 }
2821 }
2822 p->node_stamp = 0;
2823 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2824 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
Valentin Schneiderb34920d2019-07-15 11:25:07 +01002825 /* Protect against double add, see task_tick_numa and task_numa_work */
Valentin Schneiderd35927a2019-07-15 11:25:06 +01002826 p->numa_work.next = &p->numa_work;
2827 p->numa_faults = NULL;
2828 RCU_INIT_POINTER(p->numa_group, NULL);
2829 p->last_task_numa_placement = 0;
2830 p->last_sum_exec_runtime = 0;
2831
Valentin Schneiderb34920d2019-07-15 11:25:07 +01002832 init_task_work(&p->numa_work, task_numa_work);
2833
Valentin Schneiderd35927a2019-07-15 11:25:06 +01002834 /* New address space, reset the preferred nid */
2835 if (!(clone_flags & CLONE_VM)) {
2836 p->numa_preferred_nid = NUMA_NO_NODE;
2837 return;
2838 }
2839
2840 /*
2841 * New thread, keep existing numa_preferred_nid which should be copied
2842 * already by arch_dup_task_struct but stagger when scans start.
2843 */
2844 if (mm) {
2845 unsigned int delay;
2846
2847 delay = min_t(unsigned int, task_scan_max(current),
2848 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2849 delay += 2 * TICK_NSEC;
2850 p->node_stamp = delay;
2851 }
2852}
2853
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002854/*
2855 * Drive the periodic memory faults..
2856 */
YueHaibingb1546ed2019-04-18 22:47:13 +08002857static void task_tick_numa(struct rq *rq, struct task_struct *curr)
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002858{
2859 struct callback_head *work = &curr->numa_work;
2860 u64 period, now;
2861
2862 /*
2863 * We don't care about NUMA placement if we don't have memory.
2864 */
Jens Axboe18f855e2020-05-26 09:38:31 -06002865 if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002866 return;
2867
2868 /*
2869 * Using runtime rather than walltime has the dual advantage that
2870 * we (mostly) drive the selection from busy threads and that the
2871 * task needs to have done some actual work before we bother with
2872 * NUMA placement.
2873 */
2874 now = curr->se.sum_exec_runtime;
2875 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2876
Rik van Riel25b3e5a2015-11-05 15:56:22 -05002877 if (now > curr->node_stamp + period) {
Peter Zijlstra4b96a292012-10-25 14:16:47 +02002878 if (!curr->node_stamp)
Rik van Rielb5dd77c2017-07-31 15:28:47 -04002879 curr->numa_scan_period = task_scan_start(curr);
Peter Zijlstra19a78d12013-10-07 11:28:51 +01002880 curr->node_stamp += period;
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002881
Valentin Schneiderb34920d2019-07-15 11:25:07 +01002882 if (!time_before(jiffies, curr->mm->numa_next_scan))
Jens Axboe91989c72020-10-16 09:02:26 -06002883 task_work_add(curr, work, TWA_RESUME);
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002884 }
2885}
Rik van Riel3fed3822017-06-23 12:55:29 -04002886
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05302887static void update_scan_period(struct task_struct *p, int new_cpu)
2888{
2889 int src_nid = cpu_to_node(task_cpu(p));
2890 int dst_nid = cpu_to_node(new_cpu);
2891
Mel Gorman05cbdf42018-09-21 23:18:59 +05302892 if (!static_branch_likely(&sched_numa_balancing))
2893 return;
2894
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05302895 if (!p->mm || !p->numa_faults || (p->flags & PF_EXITING))
2896 return;
2897
Mel Gorman05cbdf42018-09-21 23:18:59 +05302898 if (src_nid == dst_nid)
2899 return;
2900
2901 /*
2902 * Allow resets if faults have been trapped before one scan
2903 * has completed. This is most likely due to a new task that
2904 * is pulled cross-node due to wakeups or load balancing.
2905 */
2906 if (p->numa_scan_seq) {
2907 /*
2908 * Avoid scan adjustments if moving to the preferred
2909 * node or if the task was not previously running on
2910 * the preferred node.
2911 */
2912 if (dst_nid == p->numa_preferred_nid ||
Anshuman Khandual98fa15f2019-03-05 15:42:58 -08002913 (p->numa_preferred_nid != NUMA_NO_NODE &&
2914 src_nid != p->numa_preferred_nid))
Mel Gorman05cbdf42018-09-21 23:18:59 +05302915 return;
2916 }
2917
2918 p->numa_scan_period = task_scan_start(p);
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05302919}
2920
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002921#else
2922static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2923{
2924}
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002925
2926static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2927{
2928}
2929
2930static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2931{
2932}
Rik van Riel3fed3822017-06-23 12:55:29 -04002933
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05302934static inline void update_scan_period(struct task_struct *p, int new_cpu)
2935{
2936}
2937
Peter Zijlstracbee9f82012-10-25 14:16:43 +02002938#endif /* CONFIG_NUMA_BALANCING */
2939
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002940static void
2941account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2942{
2943 update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra367456c2012-02-20 21:49:09 +01002944#ifdef CONFIG_SMP
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002945 if (entity_is_task(se)) {
2946 struct rq *rq = rq_of(cfs_rq);
2947
2948 account_numa_enqueue(rq, task_of(se));
2949 list_add(&se->group_node, &rq->cfs_tasks);
2950 }
Peter Zijlstra367456c2012-02-20 21:49:09 +01002951#endif
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002952 cfs_rq->nr_running++;
Josh Dona480add2021-08-19 18:04:01 -07002953 if (se_is_idle(se))
2954 cfs_rq->idle_nr_running++;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002955}
2956
2957static void
2958account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2959{
2960 update_load_sub(&cfs_rq->load, se->load.weight);
Tim Chenbfdb1982016-02-01 14:47:59 -08002961#ifdef CONFIG_SMP
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002962 if (entity_is_task(se)) {
2963 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
Bharata B Raob87f1722008-09-25 09:53:54 +05302964 list_del_init(&se->group_node);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01002965 }
Tim Chenbfdb1982016-02-01 14:47:59 -08002966#endif
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002967 cfs_rq->nr_running--;
Josh Dona480add2021-08-19 18:04:01 -07002968 if (se_is_idle(se))
2969 cfs_rq->idle_nr_running--;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02002970}
2971
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02002972/*
2973 * Signed add and clamp on underflow.
2974 *
2975 * Explicitly do a load-store to ensure the intermediate value never hits
2976 * memory. This allows lockless observations without ever seeing the negative
2977 * values.
2978 */
2979#define add_positive(_ptr, _val) do { \
2980 typeof(_ptr) ptr = (_ptr); \
2981 typeof(_val) val = (_val); \
2982 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2983 \
2984 res = var + val; \
2985 \
2986 if (val < 0 && res > var) \
2987 res = 0; \
2988 \
2989 WRITE_ONCE(*ptr, res); \
2990} while (0)
2991
2992/*
2993 * Unsigned subtract and clamp on underflow.
2994 *
2995 * Explicitly do a load-store to ensure the intermediate value never hits
2996 * memory. This allows lockless observations without ever seeing the negative
2997 * values.
2998 */
2999#define sub_positive(_ptr, _val) do { \
3000 typeof(_ptr) ptr = (_ptr); \
3001 typeof(*ptr) val = (_val); \
3002 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3003 res = var - val; \
3004 if (res > var) \
3005 res = 0; \
3006 WRITE_ONCE(*ptr, res); \
3007} while (0)
3008
Patrick Bellasib5c0ce72018-11-05 14:54:00 +00003009/*
3010 * Remove and clamp on negative, from a local variable.
3011 *
3012 * A variant of sub_positive(), which does not use explicit load-store
3013 * and is thus optimized for local variable updates.
3014 */
3015#define lsub_positive(_ptr, _val) do { \
3016 typeof(_ptr) ptr = (_ptr); \
3017 *ptr -= min_t(typeof(*ptr), *ptr, _val); \
3018} while (0)
3019
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003020#ifdef CONFIG_SMP
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003021static inline void
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003022enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3023{
3024 cfs_rq->avg.load_avg += se->avg.load_avg;
3025 cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
3026}
3027
3028static inline void
3029dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3030{
3031 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003032 sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
3033 /* See update_cfs_rq_load_avg() */
3034 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3035 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003036}
3037#else
3038static inline void
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003039enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3040static inline void
3041dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
3042#endif
3043
Vincent Guittot90593932017-05-17 11:50:45 +02003044static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
Vincent Guittot0dacee12020-02-24 09:52:17 +00003045 unsigned long weight)
Vincent Guittot90593932017-05-17 11:50:45 +02003046{
3047 if (se->on_rq) {
3048 /* commit outstanding execution time */
3049 if (cfs_rq->curr == se)
3050 update_curr(cfs_rq);
Jiang Biao1724b952020-08-11 19:32:09 +08003051 update_load_sub(&cfs_rq->load, se->load.weight);
Vincent Guittot90593932017-05-17 11:50:45 +02003052 }
3053 dequeue_load_avg(cfs_rq, se);
3054
3055 update_load_set(&se->load, weight);
3056
3057#ifdef CONFIG_SMP
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003058 do {
Vincent Guittot87e867b2020-06-12 17:47:03 +02003059 u32 divider = get_pelt_divider(&se->avg);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003060
3061 se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003062 } while (0);
Vincent Guittot90593932017-05-17 11:50:45 +02003063#endif
3064
3065 enqueue_load_avg(cfs_rq, se);
Vincent Guittot0dacee12020-02-24 09:52:17 +00003066 if (se->on_rq)
Jiang Biao1724b952020-08-11 19:32:09 +08003067 update_load_add(&cfs_rq->load, se->load.weight);
Vincent Guittot0dacee12020-02-24 09:52:17 +00003068
Vincent Guittot90593932017-05-17 11:50:45 +02003069}
3070
3071void reweight_task(struct task_struct *p, int prio)
3072{
3073 struct sched_entity *se = &p->se;
3074 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3075 struct load_weight *load = &se->load;
3076 unsigned long weight = scale_load(sched_prio_to_weight[prio]);
3077
Vincent Guittot0dacee12020-02-24 09:52:17 +00003078 reweight_entity(cfs_rq, se, weight);
Vincent Guittot90593932017-05-17 11:50:45 +02003079 load->inv_weight = sched_prio_to_wmult[prio];
3080}
3081
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003082#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittot387f77c2018-02-13 09:59:42 +01003083#ifdef CONFIG_SMP
Peter Zijlstracef27402017-05-09 11:04:07 +02003084/*
3085 * All this does is approximate the hierarchical proportion which includes that
3086 * global sum we all love to hate.
3087 *
3088 * That is, the weight of a group entity, is the proportional share of the
3089 * group weight based on the group runqueue weights. That is:
3090 *
3091 * tg->weight * grq->load.weight
3092 * ge->load.weight = ----------------------------- (1)
Odin Ugedal08f7c2f2021-05-18 14:52:02 +02003093 * \Sum grq->load.weight
Peter Zijlstracef27402017-05-09 11:04:07 +02003094 *
3095 * Now, because computing that sum is prohibitively expensive to compute (been
3096 * there, done that) we approximate it with this average stuff. The average
3097 * moves slower and therefore the approximation is cheaper and more stable.
3098 *
3099 * So instead of the above, we substitute:
3100 *
3101 * grq->load.weight -> grq->avg.load_avg (2)
3102 *
3103 * which yields the following:
3104 *
3105 * tg->weight * grq->avg.load_avg
3106 * ge->load.weight = ------------------------------ (3)
Odin Ugedal08f7c2f2021-05-18 14:52:02 +02003107 * tg->load_avg
Peter Zijlstracef27402017-05-09 11:04:07 +02003108 *
3109 * Where: tg->load_avg ~= \Sum grq->avg.load_avg
3110 *
3111 * That is shares_avg, and it is right (given the approximation (2)).
3112 *
3113 * The problem with it is that because the average is slow -- it was designed
3114 * to be exactly that of course -- this leads to transients in boundary
3115 * conditions. In specific, the case where the group was idle and we start the
3116 * one task. It takes time for our CPU's grq->avg.load_avg to build up,
3117 * yielding bad latency etc..
3118 *
3119 * Now, in that special case (1) reduces to:
3120 *
3121 * tg->weight * grq->load.weight
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02003122 * ge->load.weight = ----------------------------- = tg->weight (4)
Odin Ugedal08f7c2f2021-05-18 14:52:02 +02003123 * grp->load.weight
Peter Zijlstracef27402017-05-09 11:04:07 +02003124 *
3125 * That is, the sum collapses because all other CPUs are idle; the UP scenario.
3126 *
3127 * So what we do is modify our approximation (3) to approach (4) in the (near)
3128 * UP case, like:
3129 *
3130 * ge->load.weight =
3131 *
3132 * tg->weight * grq->load.weight
3133 * --------------------------------------------------- (5)
3134 * tg->load_avg - grq->avg.load_avg + grq->load.weight
3135 *
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02003136 * But because grq->load.weight can drop to 0, resulting in a divide by zero,
3137 * we need to use grq->avg.load_avg as its lower bound, which then gives:
3138 *
3139 *
3140 * tg->weight * grq->load.weight
3141 * ge->load.weight = ----------------------------- (6)
Odin Ugedal08f7c2f2021-05-18 14:52:02 +02003142 * tg_load_avg'
Peter Zijlstra17de4ee2017-08-24 13:06:35 +02003143 *
3144 * Where:
3145 *
3146 * tg_load_avg' = tg->load_avg - grq->avg.load_avg +
3147 * max(grq->load.weight, grq->avg.load_avg)
Peter Zijlstracef27402017-05-09 11:04:07 +02003148 *
3149 * And that is shares_weight and is icky. In the (near) UP case it approaches
3150 * (4) while in the normal case it approaches (3). It consistently
3151 * overestimates the ge->load.weight and therefore:
3152 *
3153 * \Sum ge->load.weight >= tg->weight
3154 *
3155 * hence icky!
3156 */
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04003157static long calc_group_shares(struct cfs_rq *cfs_rq)
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003158{
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02003159 long tg_weight, tg_shares, load, shares;
3160 struct task_group *tg = cfs_rq->tg;
3161
3162 tg_shares = READ_ONCE(tg->shares);
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003163
Peter Zijlstra3d4b60d2017-05-11 18:16:06 +02003164 load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
Peter Zijlstraea1dc6f2016-06-24 16:11:02 +02003165
3166 tg_weight = atomic_long_read(&tg->load_avg);
3167
3168 /* Ensure tg_weight >= load */
3169 tg_weight -= cfs_rq->tg_load_avg_contrib;
3170 tg_weight += load;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003171
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02003172 shares = (tg_shares * load);
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003173 if (tg_weight)
3174 shares /= tg_weight;
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003175
Dietmar Eggemannb8fd8422017-01-11 11:29:47 +00003176 /*
3177 * MIN_SHARES has to be unscaled here to support per-CPU partitioning
3178 * of a group with small tg->shares value. It is a floor value which is
3179 * assigned as a minimum load.weight to the sched_entity representing
3180 * the group on a CPU.
3181 *
3182 * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
3183 * on an 8-core system with 8 tasks each runnable on one CPU shares has
3184 * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
3185 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
3186 * instead of 0.
3187 */
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02003188 return clamp_t(long, shares, MIN_SHARES, tg_shares);
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003189}
Vincent Guittot387f77c2018-02-13 09:59:42 +01003190#endif /* CONFIG_SMP */
Peter Zijlstraea1dc6f2016-06-24 16:11:02 +02003191
Paul Turner82958362012-10-04 13:18:31 +02003192static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
3193
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003194/*
3195 * Recomputes the group entity based on the current state of its group
3196 * runqueue.
3197 */
3198static void update_cfs_group(struct sched_entity *se)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003199{
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003200 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
Vincent Guittot0dacee12020-02-24 09:52:17 +00003201 long shares;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003202
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003203 if (!gcfs_rq)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003204 return;
Vincent Guittot89ee0482016-12-21 16:50:26 +01003205
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003206 if (throttled_hierarchy(gcfs_rq))
Vincent Guittot89ee0482016-12-21 16:50:26 +01003207 return;
3208
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003209#ifndef CONFIG_SMP
Vincent Guittot0dacee12020-02-24 09:52:17 +00003210 shares = READ_ONCE(gcfs_rq->tg->shares);
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02003211
3212 if (likely(se->load.weight == shares))
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003213 return;
Peter Zijlstra7c80cfc2017-05-06 16:03:17 +02003214#else
Josef Bacik2c8e4dc2017-08-03 11:13:39 -04003215 shares = calc_group_shares(gcfs_rq);
Yong Zhang3ff6dca2011-01-24 15:33:52 +08003216#endif
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003217
Vincent Guittot0dacee12020-02-24 09:52:17 +00003218 reweight_entity(cfs_rq_of(se), se, shares);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003219}
Vincent Guittot89ee0482016-12-21 16:50:26 +01003220
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003221#else /* CONFIG_FAIR_GROUP_SCHED */
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02003222static inline void update_cfs_group(struct sched_entity *se)
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003223{
3224}
3225#endif /* CONFIG_FAIR_GROUP_SCHED */
3226
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003227static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
Viresh Kumara030d732017-05-24 10:59:52 +05303228{
Linus Torvalds43964402017-09-05 12:19:08 -07003229 struct rq *rq = rq_of(cfs_rq);
3230
Vincent Guittota4f9a0e2020-01-15 11:20:20 +01003231 if (&rq->cfs == cfs_rq) {
Viresh Kumara030d732017-05-24 10:59:52 +05303232 /*
3233 * There are a few boundary cases this might miss but it should
3234 * get called often enough that that should (hopefully) not be
Joel Fernandes9783be22017-12-15 07:39:43 -08003235 * a real problem.
Viresh Kumara030d732017-05-24 10:59:52 +05303236 *
3237 * It will not get called when we go idle, because the idle
3238 * thread is a different class (!fair), nor will the utilization
3239 * number include things like RT tasks.
3240 *
3241 * As is, the util number is not freq-invariant (we'd have to
3242 * implement arch_scale_freq_capacity() for that).
3243 *
Dietmar Eggemann82762d22021-11-18 17:42:40 +01003244 * See cpu_util_cfs().
Viresh Kumara030d732017-05-24 10:59:52 +05303245 */
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003246 cpufreq_update_util(rq, flags);
Viresh Kumara030d732017-05-24 10:59:52 +05303247 }
3248}
3249
Alex Shi141965c2013-06-26 13:05:39 +08003250#ifdef CONFIG_SMP
Paul Turnerc566e8e2012-10-04 13:18:30 +02003251#ifdef CONFIG_FAIR_GROUP_SCHED
Rik van Rielfdaba612021-06-21 19:43:30 +02003252/*
3253 * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list
3254 * immediately before a parent cfs_rq, and cfs_rqs are removed from the list
3255 * bottom-up, we only have to test whether the cfs_rq before us on the list
3256 * is our child.
3257 * If cfs_rq is not on the list, test whether a child needs its to be added to
3258 * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details).
3259 */
3260static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq)
3261{
3262 struct cfs_rq *prev_cfs_rq;
3263 struct list_head *prev;
3264
3265 if (cfs_rq->on_list) {
3266 prev = cfs_rq->leaf_cfs_rq_list.prev;
3267 } else {
3268 struct rq *rq = rq_of(cfs_rq);
3269
3270 prev = rq->tmp_alone_branch;
3271 }
3272
3273 prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list);
3274
3275 return (prev_cfs_rq->tg->parent == cfs_rq->tg);
3276}
Odin Ugedala7b359f2021-06-12 13:28:15 +02003277
3278static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
3279{
3280 if (cfs_rq->load.weight)
3281 return false;
3282
3283 if (cfs_rq->avg.load_sum)
3284 return false;
3285
3286 if (cfs_rq->avg.util_sum)
3287 return false;
3288
3289 if (cfs_rq->avg.runnable_sum)
3290 return false;
3291
Rik van Rielfdaba612021-06-21 19:43:30 +02003292 if (child_cfs_rq_on_list(cfs_rq))
3293 return false;
3294
Ingo Molnarb2c09312021-06-18 11:31:25 +02003295 /*
3296 * _avg must be null when _sum are null because _avg = _sum / divider
3297 * Make sure that rounding and/or propagation of PELT values never
3298 * break this.
3299 */
3300 SCHED_WARN_ON(cfs_rq->avg.load_avg ||
3301 cfs_rq->avg.util_avg ||
3302 cfs_rq->avg.runnable_avg);
3303
Odin Ugedala7b359f2021-06-12 13:28:15 +02003304 return true;
3305}
3306
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02003307/**
3308 * update_tg_load_avg - update the tg's load avg
3309 * @cfs_rq: the cfs_rq whose avg changed
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02003310 *
3311 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
3312 * However, because tg->load_avg is a global value there are performance
3313 * considerations.
3314 *
3315 * In order to avoid having to look at the other cfs_rq's, we use a
3316 * differential update where we store the last value we propagated. This in
3317 * turn allows skipping updates if the differential is 'small'.
3318 *
Rik van Riel815abf52017-06-23 12:55:30 -04003319 * Updating tg's load_avg is necessary before update_cfs_share().
Paul Turnerbb17f652012-10-04 13:18:31 +02003320 */
Xianting Tianfe749152020-09-24 09:47:55 +08003321static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
Paul Turnerbb17f652012-10-04 13:18:31 +02003322{
Yuyang Du9d89c252015-07-15 08:04:37 +08003323 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
Paul Turnerbb17f652012-10-04 13:18:31 +02003324
Waiman Longaa0b7ae2015-12-02 13:41:50 -05003325 /*
3326 * No need to update load_avg for root_task_group as it is not used.
3327 */
3328 if (cfs_rq->tg == &root_task_group)
3329 return;
3330
Xianting Tianfe749152020-09-24 09:47:55 +08003331 if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
Yuyang Du9d89c252015-07-15 08:04:37 +08003332 atomic_long_add(delta, &cfs_rq->tg->load_avg);
3333 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
Paul Turnerbb17f652012-10-04 13:18:31 +02003334 }
Paul Turner8165e142012-10-04 13:18:31 +02003335}
Dietmar Eggemannf5f97392014-02-26 11:19:33 +00003336
Byungchul Parkad936d82015-10-24 01:16:19 +09003337/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01003338 * Called within set_task_rq() right before setting a task's CPU. The
Byungchul Parkad936d82015-10-24 01:16:19 +09003339 * caller only guarantees p->pi_lock is held; no other assumptions,
3340 * including the state of rq->lock, should be made.
3341 */
3342void set_task_rq_fair(struct sched_entity *se,
3343 struct cfs_rq *prev, struct cfs_rq *next)
3344{
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003345 u64 p_last_update_time;
3346 u64 n_last_update_time;
3347
Byungchul Parkad936d82015-10-24 01:16:19 +09003348 if (!sched_feat(ATTACH_AGE_LOAD))
3349 return;
3350
3351 /*
3352 * We are supposed to update the task to "current" time, then its up to
3353 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
3354 * getting what current time is, so simply throw away the out-of-date
3355 * time. This will result in the wakee task is less decayed, but giving
3356 * the wakee more load sounds not bad.
3357 */
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003358 if (!(se->avg.last_update_time && prev))
3359 return;
Byungchul Parkad936d82015-10-24 01:16:19 +09003360
3361#ifndef CONFIG_64BIT
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003362 {
Byungchul Parkad936d82015-10-24 01:16:19 +09003363 u64 p_last_update_time_copy;
3364 u64 n_last_update_time_copy;
3365
3366 do {
3367 p_last_update_time_copy = prev->load_last_update_time_copy;
3368 n_last_update_time_copy = next->load_last_update_time_copy;
3369
3370 smp_rmb();
3371
3372 p_last_update_time = prev->avg.last_update_time;
3373 n_last_update_time = next->avg.last_update_time;
3374
3375 } while (p_last_update_time != p_last_update_time_copy ||
3376 n_last_update_time != n_last_update_time_copy);
Byungchul Parkad936d82015-10-24 01:16:19 +09003377 }
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003378#else
3379 p_last_update_time = prev->avg.last_update_time;
3380 n_last_update_time = next->avg.last_update_time;
3381#endif
Vincent Guittot23127292019-01-23 16:26:53 +01003382 __update_load_avg_blocked_se(p_last_update_time, se);
Peter Zijlstra0ccb9772017-03-28 11:08:20 +02003383 se->avg.last_update_time = n_last_update_time;
Byungchul Parkad936d82015-10-24 01:16:19 +09003384}
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003385
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003386/*
3387 * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
3388 * propagate its contribution. The key to this propagation is the invariant
3389 * that for each group:
3390 *
3391 * ge->avg == grq->avg (1)
3392 *
3393 * _IFF_ we look at the pure running and runnable sums. Because they
3394 * represent the very same entity, just at different points in the hierarchy.
3395 *
Vincent Guittot9f683952020-02-24 09:52:18 +00003396 * Per the above update_tg_cfs_util() and update_tg_cfs_runnable() are trivial
3397 * and simply copies the running/runnable sum over (but still wrong, because
3398 * the group entity and group rq do not have their PELT windows aligned).
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003399 *
Vincent Guittot0dacee12020-02-24 09:52:17 +00003400 * However, update_tg_cfs_load() is more complex. So we have:
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003401 *
3402 * ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg (2)
3403 *
3404 * And since, like util, the runnable part should be directly transferable,
3405 * the following would _appear_ to be the straight forward approach:
3406 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003407 * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3)
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003408 *
3409 * And per (1) we have:
3410 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003411 * ge->avg.runnable_avg == grq->avg.runnable_avg
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003412 *
3413 * Which gives:
3414 *
3415 * ge->load.weight * grq->avg.load_avg
3416 * ge->avg.load_avg = ----------------------------------- (4)
3417 * grq->load.weight
3418 *
3419 * Except that is wrong!
3420 *
3421 * Because while for entities historical weight is not important and we
3422 * really only care about our future and therefore can consider a pure
3423 * runnable sum, runqueues can NOT do this.
3424 *
3425 * We specifically want runqueues to have a load_avg that includes
3426 * historical weights. Those represent the blocked load, the load we expect
3427 * to (shortly) return to us. This only works by keeping the weights as
3428 * integral part of the sum. We therefore cannot decompose as per (3).
3429 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003430 * Another reason this doesn't work is that runnable isn't a 0-sum entity.
3431 * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the
3432 * rq itself is runnable anywhere between 2/3 and 1 depending on how the
3433 * runnable section of these tasks overlap (or not). If they were to perfectly
3434 * align the rq as a whole would be runnable 2/3 of the time. If however we
3435 * always have at least 1 runnable task, the rq as a whole is always runnable.
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003436 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003437 * So we'll have to approximate.. :/
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003438 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003439 * Given the constraint:
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003440 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003441 * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003442 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003443 * We can construct a rule that adds runnable to a rq by assuming minimal
3444 * overlap.
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003445 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003446 * On removal, we'll assume each task is equally runnable; which yields:
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003447 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003448 * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003449 *
Vincent Guittota4c3c042017-11-16 15:21:52 +01003450 * XXX: only do this for the part of runnable > running ?
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003451 *
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003452 */
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003453static inline void
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003454update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003455{
Vincent Guittot7ceb7712022-01-11 14:46:57 +01003456 long delta_sum, delta_avg = gcfs_rq->avg.util_avg - se->avg.util_avg;
3457 u32 new_sum, divider;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003458
3459 /* Nothing to update */
Vincent Guittot7ceb7712022-01-11 14:46:57 +01003460 if (!delta_avg)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003461 return;
3462
Vincent Guittot87e867b2020-06-12 17:47:03 +02003463 /*
3464 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3465 * See ___update_load_avg() for details.
3466 */
3467 divider = get_pelt_divider(&cfs_rq->avg);
3468
Vincent Guittot7ceb7712022-01-11 14:46:57 +01003469
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003470 /* Set new sched_entity's utilization */
3471 se->avg.util_avg = gcfs_rq->avg.util_avg;
Vincent Guittot7ceb7712022-01-11 14:46:57 +01003472 new_sum = se->avg.util_avg * divider;
3473 delta_sum = (long)new_sum - (long)se->avg.util_sum;
3474 se->avg.util_sum = new_sum;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003475
3476 /* Update parent cfs_rq utilization */
Vincent Guittot7ceb7712022-01-11 14:46:57 +01003477 add_positive(&cfs_rq->avg.util_avg, delta_avg);
3478 add_positive(&cfs_rq->avg.util_sum, delta_sum);
3479
3480 /* See update_cfs_rq_load_avg() */
3481 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
3482 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003483}
3484
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003485static inline void
Vincent Guittot9f683952020-02-24 09:52:18 +00003486update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
3487{
Vincent Guittot95246d12022-01-11 14:46:58 +01003488 long delta_sum, delta_avg = gcfs_rq->avg.runnable_avg - se->avg.runnable_avg;
3489 u32 new_sum, divider;
Vincent Guittot9f683952020-02-24 09:52:18 +00003490
3491 /* Nothing to update */
Vincent Guittot95246d12022-01-11 14:46:58 +01003492 if (!delta_avg)
Vincent Guittot9f683952020-02-24 09:52:18 +00003493 return;
3494
Vincent Guittot87e867b2020-06-12 17:47:03 +02003495 /*
3496 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3497 * See ___update_load_avg() for details.
3498 */
3499 divider = get_pelt_divider(&cfs_rq->avg);
3500
Vincent Guittot9f683952020-02-24 09:52:18 +00003501 /* Set new sched_entity's runnable */
3502 se->avg.runnable_avg = gcfs_rq->avg.runnable_avg;
Vincent Guittot95246d12022-01-11 14:46:58 +01003503 new_sum = se->avg.runnable_avg * divider;
3504 delta_sum = (long)new_sum - (long)se->avg.runnable_sum;
3505 se->avg.runnable_sum = new_sum;
Vincent Guittot9f683952020-02-24 09:52:18 +00003506
3507 /* Update parent cfs_rq runnable */
Vincent Guittot95246d12022-01-11 14:46:58 +01003508 add_positive(&cfs_rq->avg.runnable_avg, delta_avg);
3509 add_positive(&cfs_rq->avg.runnable_sum, delta_sum);
3510 /* See update_cfs_rq_load_avg() */
3511 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
3512 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
Vincent Guittot9f683952020-02-24 09:52:18 +00003513}
3514
3515static inline void
Vincent Guittot0dacee12020-02-24 09:52:17 +00003516update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003517{
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003518 long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum;
Vincent Guittot0dacee12020-02-24 09:52:17 +00003519 unsigned long load_avg;
3520 u64 load_sum = 0;
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003521 s64 delta_sum;
Vincent Guittot95d68592020-05-06 17:53:01 +02003522 u32 divider;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003523
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003524 if (!runnable_sum)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003525 return;
3526
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003527 gcfs_rq->prop_runnable_sum = 0;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003528
Vincent Guittot95d68592020-05-06 17:53:01 +02003529 /*
3530 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3531 * See ___update_load_avg() for details.
3532 */
Vincent Guittot87e867b2020-06-12 17:47:03 +02003533 divider = get_pelt_divider(&cfs_rq->avg);
Vincent Guittot95d68592020-05-06 17:53:01 +02003534
Vincent Guittota4c3c042017-11-16 15:21:52 +01003535 if (runnable_sum >= 0) {
3536 /*
3537 * Add runnable; clip at LOAD_AVG_MAX. Reflects that until
3538 * the CPU is saturated running == runnable.
3539 */
3540 runnable_sum += se->avg.load_sum;
Vincent Guittot95d68592020-05-06 17:53:01 +02003541 runnable_sum = min_t(long, runnable_sum, divider);
Vincent Guittota4c3c042017-11-16 15:21:52 +01003542 } else {
3543 /*
3544 * Estimate the new unweighted runnable_sum of the gcfs_rq by
3545 * assuming all tasks are equally runnable.
3546 */
3547 if (scale_load_down(gcfs_rq->load.weight)) {
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003548 load_sum = div_u64(gcfs_rq->avg.load_sum,
Vincent Guittota4c3c042017-11-16 15:21:52 +01003549 scale_load_down(gcfs_rq->load.weight));
3550 }
3551
3552 /* But make sure to not inflate se's runnable */
3553 runnable_sum = min(se->avg.load_sum, load_sum);
3554 }
3555
3556 /*
3557 * runnable_sum can't be lower than running_sum
Vincent Guittot23127292019-01-23 16:26:53 +01003558 * Rescale running sum to be in the same range as runnable sum
3559 * running_sum is in [0 : LOAD_AVG_MAX << SCHED_CAPACITY_SHIFT]
3560 * runnable_sum is in [0 : LOAD_AVG_MAX]
Vincent Guittota4c3c042017-11-16 15:21:52 +01003561 */
Vincent Guittot23127292019-01-23 16:26:53 +01003562 running_sum = se->avg.util_sum >> SCHED_CAPACITY_SHIFT;
Vincent Guittota4c3c042017-11-16 15:21:52 +01003563 runnable_sum = max(runnable_sum, running_sum);
3564
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003565 load_sum = se_weight(se) * runnable_sum;
3566 load_avg = div_u64(load_sum, divider);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003567
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003568 delta_avg = load_avg - se->avg.load_avg;
3569 if (!delta_avg)
Dietmar Eggemann83c5e9d2021-06-01 10:36:16 +02003570 return;
3571
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003572 delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum;
Vincent Guittot7c7ad622021-05-27 14:29:15 +02003573
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003574 se->avg.load_sum = runnable_sum;
3575 se->avg.load_avg = load_avg;
3576 add_positive(&cfs_rq->avg.load_avg, delta_avg);
3577 add_positive(&cfs_rq->avg.load_sum, delta_sum);
3578 /* See update_cfs_rq_load_avg() */
3579 cfs_rq->avg.load_sum = max_t(u32, cfs_rq->avg.load_sum,
3580 cfs_rq->avg.load_avg * PELT_MIN_DIVIDER);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003581}
3582
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003583static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003584{
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003585 cfs_rq->propagate = 1;
3586 cfs_rq->prop_runnable_sum += runnable_sum;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003587}
3588
3589/* Update task and its cfs_rq load average */
3590static inline int propagate_entity_load_avg(struct sched_entity *se)
3591{
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003592 struct cfs_rq *cfs_rq, *gcfs_rq;
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003593
3594 if (entity_is_task(se))
3595 return 0;
3596
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003597 gcfs_rq = group_cfs_rq(se);
3598 if (!gcfs_rq->propagate)
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003599 return 0;
3600
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003601 gcfs_rq->propagate = 0;
3602
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003603 cfs_rq = cfs_rq_of(se);
3604
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003605 add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003606
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003607 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
Vincent Guittot9f683952020-02-24 09:52:18 +00003608 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
Vincent Guittot0dacee12020-02-24 09:52:17 +00003609 update_tg_cfs_load(cfs_rq, se, gcfs_rq);
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003610
Qais Yousefba19f512019-06-04 12:14:56 +01003611 trace_pelt_cfs_tp(cfs_rq);
Qais Yousef8de62422019-06-04 12:14:57 +01003612 trace_pelt_se_tp(se);
Qais Yousefba19f512019-06-04 12:14:56 +01003613
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003614 return 1;
3615}
3616
Vincent Guittotbc427892017-03-17 14:47:22 +01003617/*
3618 * Check if we need to update the load and the utilization of a blocked
3619 * group_entity:
3620 */
3621static inline bool skip_blocked_update(struct sched_entity *se)
3622{
3623 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3624
3625 /*
3626 * If sched_entity still have not zero load or utilization, we have to
3627 * decay it:
3628 */
3629 if (se->avg.load_avg || se->avg.util_avg)
3630 return false;
3631
3632 /*
3633 * If there is a pending propagation, we have to update the load and
3634 * the utilization of the sched_entity:
3635 */
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003636 if (gcfs_rq->propagate)
Vincent Guittotbc427892017-03-17 14:47:22 +01003637 return false;
3638
3639 /*
3640 * Otherwise, the load and the utilization of the sched_entity is
3641 * already zero and there is no pending propagation, so it will be a
3642 * waste of time to try to decay it:
3643 */
3644 return true;
3645}
3646
Peter Zijlstra6e831252014-02-11 16:11:48 +01003647#else /* CONFIG_FAIR_GROUP_SCHED */
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003648
Xianting Tianfe749152020-09-24 09:47:55 +08003649static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003650
3651static inline int propagate_entity_load_avg(struct sched_entity *se)
3652{
3653 return 0;
3654}
3655
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003656static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
Vincent Guittot09a43ac2016-11-08 10:53:45 +01003657
Peter Zijlstra6e831252014-02-11 16:11:48 +01003658#endif /* CONFIG_FAIR_GROUP_SCHED */
Paul Turnerc566e8e2012-10-04 13:18:30 +02003659
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003660/**
3661 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
Vincent Guittot23127292019-01-23 16:26:53 +01003662 * @now: current time, as per cfs_rq_clock_pelt()
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003663 * @cfs_rq: cfs_rq to update
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003664 *
3665 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3666 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3667 * post_init_entity_util_avg().
3668 *
3669 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3670 *
Randy Dunlapa315da52021-12-17 21:59:00 -08003671 * Return: true if the load decayed or we removed load.
Peter Zijlstra7c3edd22016-07-13 10:56:25 +02003672 *
3673 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3674 * call update_tg_load_avg() when this function returns true.
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003675 */
Steve Mucklea2c6c912016-03-24 15:26:07 -07003676static inline int
Viresh Kumar3a123bb2017-05-24 10:59:56 +05303677update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
Steve Mucklea2c6c912016-03-24 15:26:07 -07003678{
Vincent Guittot9f683952020-02-24 09:52:18 +00003679 unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0;
Steve Mucklea2c6c912016-03-24 15:26:07 -07003680 struct sched_avg *sa = &cfs_rq->avg;
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003681 int decayed = 0;
Steve Mucklea2c6c912016-03-24 15:26:07 -07003682
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003683 if (cfs_rq->removed.nr) {
3684 unsigned long r;
Vincent Guittot87e867b2020-06-12 17:47:03 +02003685 u32 divider = get_pelt_divider(&cfs_rq->avg);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003686
3687 raw_spin_lock(&cfs_rq->removed.lock);
3688 swap(cfs_rq->removed.util_avg, removed_util);
3689 swap(cfs_rq->removed.load_avg, removed_load);
Vincent Guittot9f683952020-02-24 09:52:18 +00003690 swap(cfs_rq->removed.runnable_avg, removed_runnable);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003691 cfs_rq->removed.nr = 0;
3692 raw_spin_unlock(&cfs_rq->removed.lock);
3693
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003694 r = removed_load;
Peter Zijlstra89741892016-06-16 10:50:40 +02003695 sub_positive(&sa->load_avg, r);
Vincent Guittot2d02fa82022-01-11 14:46:59 +01003696 sub_positive(&sa->load_sum, r * divider);
3697 /* See sa->util_sum below */
3698 sa->load_sum = max_t(u32, sa->load_sum, sa->load_avg * PELT_MIN_DIVIDER);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003699
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003700 r = removed_util;
Peter Zijlstra89741892016-06-16 10:50:40 +02003701 sub_positive(&sa->util_avg, r);
Vincent Guittot98b0d892022-01-11 14:46:56 +01003702 sub_positive(&sa->util_sum, r * divider);
3703 /*
3704 * Because of rounding, se->util_sum might ends up being +1 more than
3705 * cfs->util_sum. Although this is not a problem by itself, detaching
3706 * a lot of tasks with the rounding problem between 2 updates of
3707 * util_avg (~1ms) can make cfs->util_sum becoming null whereas
3708 * cfs_util_avg is not.
3709 * Check that util_sum is still above its lower bound for the new
3710 * util_avg. Given that period_contrib might have moved since the last
3711 * sync, we are only sure that util_sum must be above or equal to
3712 * util_avg * minimum possible divider
3713 */
3714 sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003715
Vincent Guittot9f683952020-02-24 09:52:18 +00003716 r = removed_runnable;
3717 sub_positive(&sa->runnable_avg, r);
Vincent Guittot95246d12022-01-11 14:46:58 +01003718 sub_positive(&sa->runnable_sum, r * divider);
3719 /* See sa->util_sum above */
3720 sa->runnable_sum = max_t(u32, sa->runnable_sum,
3721 sa->runnable_avg * PELT_MIN_DIVIDER);
Vincent Guittot9f683952020-02-24 09:52:18 +00003722
3723 /*
3724 * removed_runnable is the unweighted version of removed_load so we
3725 * can use it to estimate removed_load_sum.
3726 */
3727 add_tg_cfs_propagate(cfs_rq,
3728 -(long)(removed_runnable * divider) >> SCHED_CAPACITY_SHIFT);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003729
3730 decayed = 1;
Steve Mucklea2c6c912016-03-24 15:26:07 -07003731 }
3732
Vincent Guittot23127292019-01-23 16:26:53 +01003733 decayed |= __update_load_avg_cfs_rq(now, cfs_rq);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003734
3735#ifndef CONFIG_64BIT
3736 smp_wmb();
3737 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3738#endif
3739
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003740 return decayed;
Yuyang Du9d89c252015-07-15 08:04:37 +08003741}
3742
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003743/**
3744 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3745 * @cfs_rq: cfs_rq to attach to
3746 * @se: sched_entity to attach
3747 *
3748 * Must call update_cfs_rq_load_avg() before this, since we rely on
3749 * cfs_rq->avg.last_update_time being current.
3750 */
Vincent Guittota4f9a0e2020-01-15 11:20:20 +01003751static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
Byungchul Parka05e8c52015-08-20 20:21:56 +09003752{
Vincent Guittot95d68592020-05-06 17:53:01 +02003753 /*
3754 * cfs_rq->avg.period_contrib can be used for both cfs_rq and se.
3755 * See ___update_load_avg() for details.
3756 */
Vincent Guittot87e867b2020-06-12 17:47:03 +02003757 u32 divider = get_pelt_divider(&cfs_rq->avg);
Peter Zijlstraf2079342017-05-12 14:16:30 +02003758
3759 /*
3760 * When we attach the @se to the @cfs_rq, we must align the decay
3761 * window because without that, really weird and wonderful things can
3762 * happen.
3763 *
3764 * XXX illustrate
3765 */
Byungchul Parka05e8c52015-08-20 20:21:56 +09003766 se->avg.last_update_time = cfs_rq->avg.last_update_time;
Peter Zijlstraf2079342017-05-12 14:16:30 +02003767 se->avg.period_contrib = cfs_rq->avg.period_contrib;
3768
3769 /*
3770 * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
3771 * period_contrib. This isn't strictly correct, but since we're
3772 * entirely outside of the PELT hierarchy, nobody cares if we truncate
3773 * _sum a little.
3774 */
3775 se->avg.util_sum = se->avg.util_avg * divider;
3776
Vincent Guittot9f683952020-02-24 09:52:18 +00003777 se->avg.runnable_sum = se->avg.runnable_avg * divider;
3778
Peter Zijlstraf2079342017-05-12 14:16:30 +02003779 se->avg.load_sum = divider;
3780 if (se_weight(se)) {
3781 se->avg.load_sum =
3782 div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
3783 }
3784
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003785 enqueue_load_avg(cfs_rq, se);
Byungchul Parka05e8c52015-08-20 20:21:56 +09003786 cfs_rq->avg.util_avg += se->avg.util_avg;
3787 cfs_rq->avg.util_sum += se->avg.util_sum;
Vincent Guittot9f683952020-02-24 09:52:18 +00003788 cfs_rq->avg.runnable_avg += se->avg.runnable_avg;
3789 cfs_rq->avg.runnable_sum += se->avg.runnable_sum;
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003790
3791 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003792
Vincent Guittota4f9a0e2020-01-15 11:20:20 +01003793 cfs_rq_util_change(cfs_rq, 0);
Qais Yousefba19f512019-06-04 12:14:56 +01003794
3795 trace_pelt_cfs_tp(cfs_rq);
Byungchul Parka05e8c52015-08-20 20:21:56 +09003796}
3797
Peter Zijlstra3d30544f2016-06-21 14:27:50 +02003798/**
3799 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3800 * @cfs_rq: cfs_rq to detach from
3801 * @se: sched_entity to detach
3802 *
3803 * Must call update_cfs_rq_load_avg() before this, since we rely on
3804 * cfs_rq->avg.last_update_time being current.
3805 */
Byungchul Parka05e8c52015-08-20 20:21:56 +09003806static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3807{
Peter Zijlstra8d5b9022017-08-24 17:45:35 +02003808 dequeue_load_avg(cfs_rq, se);
Peter Zijlstra89741892016-06-16 10:50:40 +02003809 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
Vincent Guittot7ceb7712022-01-11 14:46:57 +01003810 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3811 /* See update_cfs_rq_load_avg() */
3812 cfs_rq->avg.util_sum = max_t(u32, cfs_rq->avg.util_sum,
3813 cfs_rq->avg.util_avg * PELT_MIN_DIVIDER);
3814
Vincent Guittot9f683952020-02-24 09:52:18 +00003815 sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg);
Vincent Guittot95246d12022-01-11 14:46:58 +01003816 sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum);
3817 /* See update_cfs_rq_load_avg() */
3818 cfs_rq->avg.runnable_sum = max_t(u32, cfs_rq->avg.runnable_sum,
3819 cfs_rq->avg.runnable_avg * PELT_MIN_DIVIDER);
Peter Zijlstra0e2d2aa2017-05-08 17:30:46 +02003820
3821 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
Steve Mucklea2c6c912016-03-24 15:26:07 -07003822
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003823 cfs_rq_util_change(cfs_rq, 0);
Qais Yousefba19f512019-06-04 12:14:56 +01003824
3825 trace_pelt_cfs_tp(cfs_rq);
Byungchul Parka05e8c52015-08-20 20:21:56 +09003826}
3827
Peter Zijlstrab382a532017-05-06 17:37:03 +02003828/*
3829 * Optional action to be done while updating the load average
3830 */
3831#define UPDATE_TG 0x1
3832#define SKIP_AGE_LOAD 0x2
3833#define DO_ATTACH 0x4
3834
3835/* Update task and its cfs_rq load average */
3836static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3837{
Vincent Guittot23127292019-01-23 16:26:53 +01003838 u64 now = cfs_rq_clock_pelt(cfs_rq);
Peter Zijlstrab382a532017-05-06 17:37:03 +02003839 int decayed;
3840
3841 /*
3842 * Track task load average for carrying it to new CPU after migrated, and
3843 * track group sched_entity load average for task_h_load calc in migration
3844 */
3845 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
Vincent Guittot23127292019-01-23 16:26:53 +01003846 __update_load_avg_se(now, cfs_rq, se);
Peter Zijlstrab382a532017-05-06 17:37:03 +02003847
3848 decayed = update_cfs_rq_load_avg(now, cfs_rq);
3849 decayed |= propagate_entity_load_avg(se);
3850
3851 if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
3852
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01003853 /*
3854 * DO_ATTACH means we're here from enqueue_entity().
3855 * !last_update_time means we've passed through
3856 * migrate_task_rq_fair() indicating we migrated.
3857 *
3858 * IOW we're enqueueing a task on a new CPU.
3859 */
Vincent Guittota4f9a0e2020-01-15 11:20:20 +01003860 attach_entity_load_avg(cfs_rq, se);
Xianting Tianfe749152020-09-24 09:47:55 +08003861 update_tg_load_avg(cfs_rq);
Peter Zijlstrab382a532017-05-06 17:37:03 +02003862
Vincent Guittotbef69dd2019-11-18 14:21:19 +01003863 } else if (decayed) {
3864 cfs_rq_util_change(cfs_rq, 0);
3865
3866 if (flags & UPDATE_TG)
Xianting Tianfe749152020-09-24 09:47:55 +08003867 update_tg_load_avg(cfs_rq);
Vincent Guittotbef69dd2019-11-18 14:21:19 +01003868 }
Peter Zijlstrab382a532017-05-06 17:37:03 +02003869}
3870
Yuyang Du0905f042015-12-17 07:34:27 +08003871#ifndef CONFIG_64BIT
3872static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3873{
3874 u64 last_update_time_copy;
3875 u64 last_update_time;
3876
3877 do {
3878 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3879 smp_rmb();
3880 last_update_time = cfs_rq->avg.last_update_time;
3881 } while (last_update_time != last_update_time_copy);
3882
3883 return last_update_time;
3884}
3885#else
3886static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3887{
3888 return cfs_rq->avg.last_update_time;
3889}
3890#endif
3891
Paul Turner9ee474f2012-10-04 13:18:30 +02003892/*
Morten Rasmussen104cb162016-10-14 14:41:07 +01003893 * Synchronize entity load avg of dequeued entity without locking
3894 * the previous rq.
3895 */
YueHaibing71b47ea2019-03-20 21:38:39 +08003896static void sync_entity_load_avg(struct sched_entity *se)
Morten Rasmussen104cb162016-10-14 14:41:07 +01003897{
3898 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3899 u64 last_update_time;
3900
3901 last_update_time = cfs_rq_last_update_time(cfs_rq);
Vincent Guittot23127292019-01-23 16:26:53 +01003902 __update_load_avg_blocked_se(last_update_time, se);
Morten Rasmussen104cb162016-10-14 14:41:07 +01003903}
3904
3905/*
Yuyang Du9d89c252015-07-15 08:04:37 +08003906 * Task first catches up with cfs_rq, and then subtract
3907 * itself from the cfs_rq (task must be off the queue now).
Paul Turner9ee474f2012-10-04 13:18:30 +02003908 */
YueHaibing71b47ea2019-03-20 21:38:39 +08003909static void remove_entity_load_avg(struct sched_entity *se)
Paul Turner9ee474f2012-10-04 13:18:30 +02003910{
Yuyang Du9d89c252015-07-15 08:04:37 +08003911 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003912 unsigned long flags;
Paul Turner9ee474f2012-10-04 13:18:30 +02003913
Yuyang Du0905f042015-12-17 07:34:27 +08003914 /*
Peter Zijlstra7dc603c2016-06-16 13:29:28 +02003915 * tasks cannot exit without having gone through wake_up_new_task() ->
3916 * post_init_entity_util_avg() which will have added things to the
3917 * cfs_rq, so we can remove unconditionally.
Yuyang Du0905f042015-12-17 07:34:27 +08003918 */
Paul Turner9ee474f2012-10-04 13:18:30 +02003919
Morten Rasmussen104cb162016-10-14 14:41:07 +01003920 sync_entity_load_avg(se);
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003921
3922 raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
3923 ++cfs_rq->removed.nr;
3924 cfs_rq->removed.util_avg += se->avg.util_avg;
3925 cfs_rq->removed.load_avg += se->avg.load_avg;
Vincent Guittot9f683952020-02-24 09:52:18 +00003926 cfs_rq->removed.runnable_avg += se->avg.runnable_avg;
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +02003927 raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
Paul Turner2dac7542012-10-04 13:18:30 +02003928}
Vincent Guittot642dbc32013-04-18 18:34:26 +02003929
Vincent Guittot9f683952020-02-24 09:52:18 +00003930static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq)
3931{
3932 return cfs_rq->avg.runnable_avg;
3933}
3934
Yuyang Du7ea241a2015-07-15 08:04:42 +08003935static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3936{
3937 return cfs_rq->avg.load_avg;
3938}
3939
Chen Yud91cecc2020-04-21 18:50:34 +08003940static int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
3941
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003942static inline unsigned long task_util(struct task_struct *p)
3943{
3944 return READ_ONCE(p->se.avg.util_avg);
3945}
3946
3947static inline unsigned long _task_util_est(struct task_struct *p)
3948{
3949 struct util_est ue = READ_ONCE(p->se.avg.util_est);
3950
Dietmar Eggemann68d7a192021-06-02 16:58:08 +02003951 return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED));
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003952}
3953
3954static inline unsigned long task_util_est(struct task_struct *p)
3955{
3956 return max(task_util(p), _task_util_est(p));
3957}
3958
Valentin Schneidera7008c072019-12-11 11:38:50 +00003959#ifdef CONFIG_UCLAMP_TASK
3960static inline unsigned long uclamp_task_util(struct task_struct *p)
3961{
3962 return clamp(task_util_est(p),
3963 uclamp_eff_value(p, UCLAMP_MIN),
3964 uclamp_eff_value(p, UCLAMP_MAX));
3965}
3966#else
3967static inline unsigned long uclamp_task_util(struct task_struct *p)
3968{
3969 return task_util_est(p);
3970}
3971#endif
3972
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003973static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
3974 struct task_struct *p)
3975{
3976 unsigned int enqueued;
3977
3978 if (!sched_feat(UTIL_EST))
3979 return;
3980
3981 /* Update root cfs_rq's estimated utilization */
3982 enqueued = cfs_rq->avg.util_est.enqueued;
Patrick Bellasi92a801e2018-11-05 14:53:59 +00003983 enqueued += _task_util_est(p);
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003984 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
Vincent Donnefort4581bea2020-05-27 17:39:14 +01003985
3986 trace_sched_util_est_cfs_tp(cfs_rq);
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00003987}
3988
Xuewen Yan8c1f5602020-12-18 17:27:52 +08003989static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
3990 struct task_struct *p)
3991{
3992 unsigned int enqueued;
3993
3994 if (!sched_feat(UTIL_EST))
3995 return;
3996
3997 /* Update root cfs_rq's estimated utilization */
3998 enqueued = cfs_rq->avg.util_est.enqueued;
3999 enqueued -= min_t(unsigned int, enqueued, _task_util_est(p));
4000 WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
4001
4002 trace_sched_util_est_cfs_tp(cfs_rq);
4003}
4004
Vincent Donnefortb89997a2021-02-25 16:58:20 +00004005#define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
4006
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004007/*
4008 * Check if a (signed) value is within a specified (unsigned) margin,
4009 * based on the observation that:
4010 *
4011 * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
4012 *
Ingo Molnar3b037062021-03-18 13:38:50 +01004013 * NOTE: this only works when value + margin < INT_MAX.
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004014 */
4015static inline bool within_margin(int value, int margin)
4016{
4017 return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
4018}
4019
Xuewen Yan8c1f5602020-12-18 17:27:52 +08004020static inline void util_est_update(struct cfs_rq *cfs_rq,
4021 struct task_struct *p,
4022 bool task_sleep)
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004023{
Vincent Donnefortb89997a2021-02-25 16:58:20 +00004024 long last_ewma_diff, last_enqueued_diff;
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004025 struct util_est ue;
4026
4027 if (!sched_feat(UTIL_EST))
4028 return;
4029
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004030 /*
4031 * Skip update of task's estimated utilization when the task has not
4032 * yet completed an activation, e.g. being migrated.
4033 */
4034 if (!task_sleep)
4035 return;
4036
4037 /*
Patrick Bellasid5193292018-03-09 09:52:45 +00004038 * If the PELT values haven't changed since enqueue time,
4039 * skip the util_est update.
4040 */
4041 ue = p->se.avg.util_est;
4042 if (ue.enqueued & UTIL_AVG_UNCHANGED)
4043 return;
4044
Vincent Donnefortb89997a2021-02-25 16:58:20 +00004045 last_enqueued_diff = ue.enqueued;
4046
Patrick Bellasid5193292018-03-09 09:52:45 +00004047 /*
Patrick Bellasib8c96362019-10-23 21:56:30 +01004048 * Reset EWMA on utilization increases, the moving average is used only
4049 * to smooth utilization decreases.
4050 */
Dietmar Eggemann68d7a192021-06-02 16:58:08 +02004051 ue.enqueued = task_util(p);
Patrick Bellasib8c96362019-10-23 21:56:30 +01004052 if (sched_feat(UTIL_EST_FASTUP)) {
4053 if (ue.ewma < ue.enqueued) {
4054 ue.ewma = ue.enqueued;
4055 goto done;
4056 }
4057 }
4058
4059 /*
Vincent Donnefortb89997a2021-02-25 16:58:20 +00004060 * Skip update of task's estimated utilization when its members are
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004061 * already ~1% close to its last activation value.
4062 */
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004063 last_ewma_diff = ue.enqueued - ue.ewma;
Vincent Donnefortb89997a2021-02-25 16:58:20 +00004064 last_enqueued_diff -= ue.enqueued;
4065 if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
4066 if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
4067 goto done;
4068
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004069 return;
Vincent Donnefortb89997a2021-02-25 16:58:20 +00004070 }
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004071
4072 /*
Vincent Guittot10a35e62019-01-23 16:26:54 +01004073 * To avoid overestimation of actual task utilization, skip updates if
4074 * we cannot grant there is idle time in this CPU.
4075 */
Xuewen Yan8c1f5602020-12-18 17:27:52 +08004076 if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq))))
Vincent Guittot10a35e62019-01-23 16:26:54 +01004077 return;
4078
4079 /*
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004080 * Update Task's estimated utilization
4081 *
4082 * When *p completes an activation we can consolidate another sample
4083 * of the task size. This is done by storing the current PELT value
4084 * as ue.enqueued and by using this value to update the Exponential
4085 * Weighted Moving Average (EWMA):
4086 *
4087 * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
4088 * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
4089 * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
4090 * = w * ( last_ewma_diff ) + ewma(t-1)
4091 * = w * (last_ewma_diff + ewma(t-1) / w)
4092 *
4093 * Where 'w' is the weight of new samples, which is configured to be
4094 * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
4095 */
4096 ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
4097 ue.ewma += last_ewma_diff;
4098 ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
Patrick Bellasib8c96362019-10-23 21:56:30 +01004099done:
Dietmar Eggemann68d7a192021-06-02 16:58:08 +02004100 ue.enqueued |= UTIL_AVG_UNCHANGED;
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004101 WRITE_ONCE(p->se.avg.util_est, ue);
Vincent Donnefort4581bea2020-05-27 17:39:14 +01004102
4103 trace_sched_util_est_se_tp(&p->se);
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004104}
4105
Vincent Donnefortef8df972021-12-07 09:57:55 +00004106static inline int task_fits_capacity(struct task_struct *p,
4107 unsigned long capacity)
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01004108{
Valentin Schneidera7008c072019-12-11 11:38:50 +00004109 return fits_capacity(uclamp_task_util(p), capacity);
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01004110}
4111
4112static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
4113{
4114 if (!static_branch_unlikely(&sched_asym_cpucapacity))
4115 return;
4116
Qais Yousef0ae78ee2021-01-19 12:07:55 +00004117 if (!p || p->nr_cpus_allowed == 1) {
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01004118 rq->misfit_task_load = 0;
4119 return;
4120 }
4121
4122 if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
4123 rq->misfit_task_load = 0;
4124 return;
4125 }
4126
Vincent Guittot01cfcde2020-07-10 17:24:26 +02004127 /*
4128 * Make sure that misfit_task_load will not be null even if
4129 * task_h_load() returns 0.
4130 */
4131 rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01004132}
4133
Peter Zijlstra38033c32014-01-23 20:32:21 +01004134#else /* CONFIG_SMP */
4135
Odin Ugedala7b359f2021-06-12 13:28:15 +02004136static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
4137{
4138 return true;
4139}
4140
Vincent Guittotd31b1a62016-11-08 10:53:44 +01004141#define UPDATE_TG 0x0
4142#define SKIP_AGE_LOAD 0x0
Peter Zijlstrab382a532017-05-06 17:37:03 +02004143#define DO_ATTACH 0x0
Vincent Guittotd31b1a62016-11-08 10:53:44 +01004144
Peter Zijlstra88c06162017-05-06 17:32:43 +02004145static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
Rafael J. Wysocki536bd002016-05-06 14:58:43 +02004146{
Peter Zijlstraea14b57e2018-02-02 10:27:00 +01004147 cfs_rq_util_change(cfs_rq, 0);
Rafael J. Wysocki536bd002016-05-06 14:58:43 +02004148}
4149
Yuyang Du9d89c252015-07-15 08:04:37 +08004150static inline void remove_entity_load_avg(struct sched_entity *se) {}
Peter Zijlstra6e831252014-02-11 16:11:48 +01004151
Byungchul Parka05e8c52015-08-20 20:21:56 +09004152static inline void
Vincent Guittota4f9a0e2020-01-15 11:20:20 +01004153attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
Byungchul Parka05e8c52015-08-20 20:21:56 +09004154static inline void
4155detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
4156
Chen Yud91cecc2020-04-21 18:50:34 +08004157static inline int newidle_balance(struct rq *rq, struct rq_flags *rf)
Peter Zijlstra6e831252014-02-11 16:11:48 +01004158{
4159 return 0;
4160}
4161
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004162static inline void
4163util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4164
4165static inline void
Xuewen Yan8c1f5602020-12-18 17:27:52 +08004166util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
4167
4168static inline void
4169util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p,
4170 bool task_sleep) {}
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01004171static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
Patrick Bellasi7f65ea42018-03-09 09:52:42 +00004172
Peter Zijlstra38033c32014-01-23 20:32:21 +01004173#endif /* CONFIG_SMP */
Paul Turner9d85f212012-10-04 13:18:29 +02004174
Peter Zijlstraddc97292007-10-15 17:00:10 +02004175static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
4176{
4177#ifdef CONFIG_SCHED_DEBUG
4178 s64 d = se->vruntime - cfs_rq->min_vruntime;
4179
4180 if (d < 0)
4181 d = -d;
4182
4183 if (d > 3*sysctl_sched_latency)
Josh Poimboeufae928822016-06-17 12:43:24 -05004184 schedstat_inc(cfs_rq->nr_spread_over);
Peter Zijlstraddc97292007-10-15 17:00:10 +02004185#endif
4186}
4187
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004188static void
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004189place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
4190{
Peter Zijlstra1af5f732008-10-24 11:06:13 +02004191 u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02004192
Peter Zijlstra2cb86002007-11-09 22:39:37 +01004193 /*
4194 * The 'current' period is already promised to the current tasks,
4195 * however the extra weight of the new task will slow them down a
4196 * little, place the new task so that it fits in the slot that
4197 * stays open at the end.
4198 */
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02004199 if (initial && sched_feat(START_DEBIT))
Peter Zijlstraf9c0b092008-10-17 19:27:04 +02004200 vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004201
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004202 /* sleeps up to a single latency don't count. */
Mike Galbraith5ca98802010-03-11 17:17:17 +01004203 if (!initial) {
Josh Don2cae3942021-08-19 18:04:03 -07004204 unsigned long thresh;
4205
4206 if (se_is_idle(se))
4207 thresh = sysctl_sched_min_granularity;
4208 else
4209 thresh = sysctl_sched_latency;
Peter Zijlstraa7be37a2008-06-27 13:41:11 +02004210
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004211 /*
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004212 * Halve their sleep time's effect, to allow
4213 * for a gentler effect of sleepers:
4214 */
4215 if (sched_feat(GENTLE_FAIR_SLEEPERS))
4216 thresh >>= 1;
Ingo Molnar51e03042009-09-16 08:54:45 +02004217
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02004218 vruntime -= thresh;
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004219 }
4220
Mike Galbraithb5d9d732009-09-08 11:12:28 +02004221 /* ensure we never gain time by being placed backwards. */
Viresh Kumar16c8f1c2012-11-08 13:33:46 +05304222 se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004223}
4224
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004225static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
4226
Vincent Guittotfe614682020-03-06 14:52:57 +01004227static inline bool cfs_bandwidth_used(void);
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02004228
4229/*
4230 * MIGRATION
4231 *
4232 * dequeue
4233 * update_curr()
4234 * update_min_vruntime()
4235 * vruntime -= min_vruntime
4236 *
4237 * enqueue
4238 * update_curr()
4239 * update_min_vruntime()
4240 * vruntime += min_vruntime
4241 *
4242 * this way the vruntime transition between RQs is done when both
4243 * min_vruntime are up-to-date.
4244 *
4245 * WAKEUP (remote)
4246 *
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02004247 * ->migrate_task_rq_fair() (p->state == TASK_WAKING)
Peter Zijlstrab5179ac2016-05-11 16:10:34 +02004248 * vruntime -= min_vruntime
4249 *
4250 * enqueue
4251 * update_curr()
4252 * update_min_vruntime()
4253 * vruntime += min_vruntime
4254 *
4255 * this way we don't have the most up-to-date min_vruntime on the originating
4256 * CPU and an up-to-date min_vruntime on the destination CPU.
4257 */
4258
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004259static void
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01004260enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004261{
Peter Zijlstra2f950352016-05-11 19:27:56 +02004262 bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
4263 bool curr = cfs_rq->curr == se;
Peter Zijlstra3a47d512016-03-09 13:04:03 +01004264
Ingo Molnar53d3bc72016-05-11 08:25:53 +02004265 /*
Peter Zijlstra2f950352016-05-11 19:27:56 +02004266 * If we're the current task, we must renormalise before calling
4267 * update_curr().
Ingo Molnar53d3bc72016-05-11 08:25:53 +02004268 */
Peter Zijlstra2f950352016-05-11 19:27:56 +02004269 if (renorm && curr)
4270 se->vruntime += cfs_rq->min_vruntime;
4271
Ingo Molnarb7cc0892007-08-09 11:16:47 +02004272 update_curr(cfs_rq);
Peter Zijlstra2f950352016-05-11 19:27:56 +02004273
4274 /*
4275 * Otherwise, renormalise after, such that we're placed at the current
4276 * moment in time, instead of some random moment in the past. Being
4277 * placed in the past could significantly boost this task to the
4278 * fairness detriment of existing tasks.
4279 */
4280 if (renorm && !curr)
4281 se->vruntime += cfs_rq->min_vruntime;
4282
Vincent Guittot89ee0482016-12-21 16:50:26 +01004283 /*
4284 * When enqueuing a sched_entity, we must:
4285 * - Update loads to have both entity and cfs_rq synced with now.
Vincent Guittot9f683952020-02-24 09:52:18 +00004286 * - Add its load to cfs_rq->runnable_avg
Vincent Guittot89ee0482016-12-21 16:50:26 +01004287 * - For group_entity, update its weight to reflect the new share of
4288 * its group cfs_rq
4289 * - Add its new weight to cfs_rq->load.weight
4290 */
Peter Zijlstrab382a532017-05-06 17:37:03 +02004291 update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
Vincent Guittot9f683952020-02-24 09:52:18 +00004292 se_update_runnable(se);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02004293 update_cfs_group(se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08004294 account_entity_enqueue(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004295
Josh Poimboeuf1a3d0272016-06-17 12:43:23 -05004296 if (flags & ENQUEUE_WAKEUP)
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02004297 place_entity(cfs_rq, se, 0);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004298
Mel Gormancb251762016-02-05 09:08:36 +00004299 check_schedstat_required();
Yafang Shao60f24152021-09-05 14:35:42 +00004300 update_stats_enqueue_fair(cfs_rq, se, flags);
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004301 check_spread(cfs_rq, se);
Peter Zijlstra2f950352016-05-11 19:27:56 +02004302 if (!curr)
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004303 __enqueue_entity(cfs_rq, se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08004304 se->on_rq = 1;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08004305
Vincent Guittotfe614682020-03-06 14:52:57 +01004306 /*
4307 * When bandwidth control is enabled, cfs might have been removed
4308 * because of a parent been throttled but cfs->nr_running > 1. Try to
Ingo Molnar3b037062021-03-18 13:38:50 +01004309 * add it unconditionally.
Vincent Guittotfe614682020-03-06 14:52:57 +01004310 */
4311 if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08004312 list_add_leaf_cfs_rq(cfs_rq);
Vincent Guittotfe614682020-03-06 14:52:57 +01004313
4314 if (cfs_rq->nr_running == 1)
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004315 check_enqueue_throttle(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004316}
4317
Rik van Riel2c13c9192011-02-01 09:48:37 -05004318static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra2002c692008-11-11 11:52:33 +01004319{
Rik van Riel2c13c9192011-02-01 09:48:37 -05004320 for_each_sched_entity(se) {
4321 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01004322 if (cfs_rq->last != se)
Rik van Riel2c13c9192011-02-01 09:48:37 -05004323 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01004324
4325 cfs_rq->last = NULL;
Rik van Riel2c13c9192011-02-01 09:48:37 -05004326 }
4327}
Peter Zijlstra2002c692008-11-11 11:52:33 +01004328
Rik van Riel2c13c9192011-02-01 09:48:37 -05004329static void __clear_buddies_next(struct sched_entity *se)
4330{
4331 for_each_sched_entity(se) {
4332 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01004333 if (cfs_rq->next != se)
Rik van Riel2c13c9192011-02-01 09:48:37 -05004334 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01004335
4336 cfs_rq->next = NULL;
Rik van Riel2c13c9192011-02-01 09:48:37 -05004337 }
Peter Zijlstra2002c692008-11-11 11:52:33 +01004338}
4339
Rik van Rielac53db52011-02-01 09:51:03 -05004340static void __clear_buddies_skip(struct sched_entity *se)
4341{
4342 for_each_sched_entity(se) {
4343 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstraf1044792012-02-11 06:05:00 +01004344 if (cfs_rq->skip != se)
Rik van Rielac53db52011-02-01 09:51:03 -05004345 break;
Peter Zijlstraf1044792012-02-11 06:05:00 +01004346
4347 cfs_rq->skip = NULL;
Rik van Rielac53db52011-02-01 09:51:03 -05004348 }
4349}
4350
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01004351static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
4352{
Rik van Riel2c13c9192011-02-01 09:48:37 -05004353 if (cfs_rq->last == se)
4354 __clear_buddies_last(se);
4355
4356 if (cfs_rq->next == se)
4357 __clear_buddies_next(se);
Rik van Rielac53db52011-02-01 09:51:03 -05004358
4359 if (cfs_rq->skip == se)
4360 __clear_buddies_skip(se);
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01004361}
4362
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07004363static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd8b49862011-07-21 09:43:41 -07004364
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004365static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01004366dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004367{
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02004368 /*
4369 * Update run-time statistics of the 'current'.
4370 */
4371 update_curr(cfs_rq);
Vincent Guittot89ee0482016-12-21 16:50:26 +01004372
4373 /*
4374 * When dequeuing a sched_entity, we must:
4375 * - Update loads to have both entity and cfs_rq synced with now.
Vincent Guittot9f683952020-02-24 09:52:18 +00004376 * - Subtract its load from the cfs_rq->runnable_avg.
Ingo Molnardfcb2452018-12-03 10:05:56 +01004377 * - Subtract its previous weight from cfs_rq->load.weight.
Vincent Guittot89ee0482016-12-21 16:50:26 +01004378 * - For group entity, update its weight to reflect the new share
4379 * of its group cfs_rq.
4380 */
Peter Zijlstra88c06162017-05-06 17:32:43 +02004381 update_load_avg(cfs_rq, se, UPDATE_TG);
Vincent Guittot9f683952020-02-24 09:52:18 +00004382 se_update_runnable(se);
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02004383
Yafang Shao60f24152021-09-05 14:35:42 +00004384 update_stats_dequeue_fair(cfs_rq, se, flags);
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02004385
Peter Zijlstra2002c692008-11-11 11:52:33 +01004386 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01004387
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004388 if (se != cfs_rq->curr)
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004389 __dequeue_entity(cfs_rq, se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08004390 se->on_rq = 0;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004391 account_entity_dequeue(cfs_rq, se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01004392
4393 /*
Peter Zijlstrab60205c2016-09-20 21:58:12 +02004394 * Normalize after update_curr(); which will also have moved
4395 * min_vruntime if @se is the one holding it back. But before doing
4396 * update_min_vruntime() again, which will discount @se's position and
4397 * can move min_vruntime forward still more.
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01004398 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01004399 if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01004400 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra1e876232011-05-17 16:21:10 -07004401
Paul Turnerd8b49862011-07-21 09:43:41 -07004402 /* return excess runtime on last dequeue */
4403 return_cfs_rq_runtime(cfs_rq);
4404
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02004405 update_cfs_group(se);
Peter Zijlstrab60205c2016-09-20 21:58:12 +02004406
4407 /*
4408 * Now advance min_vruntime if @se was the entity holding it back,
4409 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
4410 * put back on, and if we advance min_vruntime, we'll be placed back
4411 * further than we started -- ie. we'll be penalized.
4412 */
Song Muchun9845c492018-10-14 19:26:12 +08004413 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
Peter Zijlstrab60205c2016-09-20 21:58:12 +02004414 update_min_vruntime(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004415}
4416
4417/*
4418 * Preempt the current task with a newly woken task if needed:
4419 */
Peter Zijlstra7c92e542007-09-05 14:32:49 +02004420static void
Ingo Molnar2e09bf52007-10-15 17:00:05 +02004421check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004422{
Peter Zijlstra11697832007-09-05 14:32:49 +02004423 unsigned long ideal_runtime, delta_exec;
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004424 struct sched_entity *se;
4425 s64 delta;
Peter Zijlstra11697832007-09-05 14:32:49 +02004426
Peter Zijlstra6d0f0eb2007-10-15 17:00:05 +02004427 ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra11697832007-09-05 14:32:49 +02004428 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01004429 if (delta_exec > ideal_runtime) {
Kirill Tkhai88751252014-06-29 00:03:57 +04004430 resched_curr(rq_of(cfs_rq));
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01004431 /*
4432 * The current task ran long enough, ensure it doesn't get
4433 * re-elected due to buddy favours.
4434 */
4435 clear_buddies(cfs_rq, curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02004436 return;
4437 }
4438
4439 /*
4440 * Ensure that a task that missed wakeup preemption by a
4441 * narrow margin doesn't have to wait for a full slice.
4442 * This also mitigates buddy induced latencies under load.
4443 */
Mike Galbraithf685cea2009-10-23 23:09:22 +02004444 if (delta_exec < sysctl_sched_min_granularity)
4445 return;
4446
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004447 se = __pick_first_entity(cfs_rq);
4448 delta = curr->vruntime - se->vruntime;
Mike Galbraithf685cea2009-10-23 23:09:22 +02004449
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004450 if (delta < 0)
4451 return;
Mike Galbraithd7d82942011-01-05 05:41:17 +01004452
Wang Xingchaof4cfb332011-09-16 13:35:52 -04004453 if (delta > ideal_runtime)
Kirill Tkhai88751252014-06-29 00:03:57 +04004454 resched_curr(rq_of(cfs_rq));
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004455}
4456
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004457static void
Ingo Molnar8494f412007-08-09 11:16:48 +02004458set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004459{
Peter Zijlstra21f56ffe2020-11-17 18:19:32 -05004460 clear_buddies(cfs_rq, se);
4461
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004462 /* 'current' is not kept within the tree. */
4463 if (se->on_rq) {
4464 /*
4465 * Any task has to be enqueued before it get to execute on
4466 * a CPU. So account for the time it spent waiting on the
4467 * runqueue.
4468 */
Yafang Shao60f24152021-09-05 14:35:42 +00004469 update_stats_wait_end_fair(cfs_rq, se);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004470 __dequeue_entity(cfs_rq, se);
Peter Zijlstra88c06162017-05-06 17:32:43 +02004471 update_load_avg(cfs_rq, se, UPDATE_TG);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02004472 }
4473
Ingo Molnar79303e92007-08-09 11:16:47 +02004474 update_stats_curr_start(cfs_rq, se);
Ingo Molnar429d43b2007-10-15 17:00:03 +02004475 cfs_rq->curr = se;
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004476
Ingo Molnareba1ed42007-10-15 17:00:02 +02004477 /*
4478 * Track our maximum slice length, if the CPU's load is at
4479 * least twice that of our own weight (i.e. dont track it
4480 * when there are only lesser-weight tasks around):
4481 */
Dietmar Eggemannf2bedc42019-04-24 09:45:56 +01004482 if (schedstat_enabled() &&
4483 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
Yafang Shaoceeadb82021-09-05 14:35:41 +00004484 struct sched_statistics *stats;
4485
4486 stats = __schedstats_from_se(se);
4487 __schedstat_set(stats->slice_max,
4488 max((u64)stats->slice_max,
Yafang Shaoa2dcb272021-09-05 14:35:40 +00004489 se->sum_exec_runtime - se->prev_sum_exec_runtime));
Ingo Molnareba1ed42007-10-15 17:00:02 +02004490 }
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004491
Peter Zijlstra4a55b452007-09-05 14:32:49 +02004492 se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004493}
4494
Peter Zijlstra3f3a4902008-10-24 11:06:16 +02004495static int
4496wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
4497
Rik van Rielac53db52011-02-01 09:51:03 -05004498/*
4499 * Pick the next process, keeping these things in mind, in this order:
4500 * 1) keep things fair between processes/task groups
4501 * 2) pick the "next" process, since someone really wants that to run
4502 * 3) pick the "last" process, for cache locality
4503 * 4) do not run the "skip" process, if something else is available
4504 */
Peter Zijlstra678d5712012-02-11 06:05:00 +01004505static struct sched_entity *
4506pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004507{
Peter Zijlstra678d5712012-02-11 06:05:00 +01004508 struct sched_entity *left = __pick_first_entity(cfs_rq);
4509 struct sched_entity *se;
4510
4511 /*
4512 * If curr is set we have to see if its left of the leftmost entity
4513 * still in the tree, provided there was anything in the tree at all.
4514 */
4515 if (!left || (curr && entity_before(curr, left)))
4516 left = curr;
4517
4518 se = left; /* ideally we run the leftmost entity */
Peter Zijlstraf4b67552008-11-04 21:25:07 +01004519
Rik van Rielac53db52011-02-01 09:51:03 -05004520 /*
4521 * Avoid running the skip buddy, if running something else can
4522 * be done without getting too unfair.
4523 */
Peter Zijlstra21f56ffe2020-11-17 18:19:32 -05004524 if (cfs_rq->skip && cfs_rq->skip == se) {
Peter Zijlstra678d5712012-02-11 06:05:00 +01004525 struct sched_entity *second;
4526
4527 if (se == curr) {
4528 second = __pick_first_entity(cfs_rq);
4529 } else {
4530 second = __pick_next_entity(se);
4531 if (!second || (curr && entity_before(curr, second)))
4532 second = curr;
4533 }
4534
Rik van Rielac53db52011-02-01 09:51:03 -05004535 if (second && wakeup_preempt_entity(second, left) < 1)
4536 se = second;
4537 }
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004538
Peter Oskolkov9abb8972020-09-30 10:35:32 -07004539 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
4540 /*
4541 * Someone really wants this to run. If it's not unfair, run it.
4542 */
Rik van Rielac53db52011-02-01 09:51:03 -05004543 se = cfs_rq->next;
Peter Oskolkov9abb8972020-09-30 10:35:32 -07004544 } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
4545 /*
4546 * Prefer last buddy, try to return the CPU to a preempted task.
4547 */
4548 se = cfs_rq->last;
4549 }
Rik van Rielac53db52011-02-01 09:51:03 -05004550
Peter Zijlstra47932412008-11-04 21:25:09 +01004551 return se;
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01004552}
4553
Peter Zijlstra678d5712012-02-11 06:05:00 +01004554static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004555
Ingo Molnarab6cde22007-08-09 11:16:48 +02004556static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004557{
4558 /*
4559 * If still on the runqueue then deactivate_task()
4560 * was not called and update_curr() has to be done:
4561 */
4562 if (prev->on_rq)
Ingo Molnarb7cc0892007-08-09 11:16:47 +02004563 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004564
Paul Turnerd3d9dc32011-07-21 09:43:39 -07004565 /* throttle cfs_rqs exceeding runtime */
4566 check_cfs_rq_runtime(cfs_rq);
4567
Josh Poimboeuf4fa8d292016-06-17 12:43:26 -05004568 check_spread(cfs_rq, prev);
Mel Gormancb251762016-02-05 09:08:36 +00004569
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004570 if (prev->on_rq) {
Yafang Shao60f24152021-09-05 14:35:42 +00004571 update_stats_wait_start_fair(cfs_rq, prev);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004572 /* Put 'current' back into the tree. */
4573 __enqueue_entity(cfs_rq, prev);
Paul Turner9d85f212012-10-04 13:18:29 +02004574 /* in !on_rq case, update occurred at dequeue */
Peter Zijlstra88c06162017-05-06 17:32:43 +02004575 update_load_avg(cfs_rq, prev, 0);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004576 }
Ingo Molnar429d43b2007-10-15 17:00:03 +02004577 cfs_rq->curr = NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004578}
4579
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004580static void
4581entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004582{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004583 /*
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004584 * Update run-time statistics of the 'current'.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004585 */
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02004586 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004587
Paul Turner43365bd2010-12-15 19:10:17 -08004588 /*
Paul Turner9d85f212012-10-04 13:18:29 +02004589 * Ensure that runnable average is periodically updated.
4590 */
Peter Zijlstra88c06162017-05-06 17:32:43 +02004591 update_load_avg(cfs_rq, curr, UPDATE_TG);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02004592 update_cfs_group(curr);
Paul Turner9d85f212012-10-04 13:18:29 +02004593
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004594#ifdef CONFIG_SCHED_HRTICK
4595 /*
4596 * queued ticks are scheduled to match the slice, so don't bother
4597 * validating it and just reschedule.
4598 */
Harvey Harrison983ed7a2008-04-24 18:17:55 -07004599 if (queued) {
Kirill Tkhai88751252014-06-29 00:03:57 +04004600 resched_curr(rq_of(cfs_rq));
Harvey Harrison983ed7a2008-04-24 18:17:55 -07004601 return;
4602 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004603 /*
4604 * don't let the period tick interfere with the hrtick preemption
4605 */
4606 if (!sched_feat(DOUBLE_TICK) &&
4607 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
4608 return;
4609#endif
4610
Yong Zhang2c2efae2011-07-29 16:20:33 +08004611 if (cfs_rq->nr_running > 1)
Ingo Molnar2e09bf52007-10-15 17:00:05 +02004612 check_preempt_tick(cfs_rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02004613}
4614
Paul Turnerab84d312011-07-21 09:43:28 -07004615
4616/**************************************************
4617 * CFS bandwidth control machinery
4618 */
4619
4620#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra029632f2011-10-25 10:00:11 +02004621
Masahiro Yamadae9666d12018-12-31 00:14:15 +09004622#ifdef CONFIG_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01004623static struct static_key __cfs_bandwidth_used;
Peter Zijlstra029632f2011-10-25 10:00:11 +02004624
4625static inline bool cfs_bandwidth_used(void)
4626{
Ingo Molnarc5905af2012-02-24 08:31:31 +01004627 return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004628}
4629
Ben Segall1ee14e62013-10-16 11:16:12 -07004630void cfs_bandwidth_usage_inc(void)
Peter Zijlstra029632f2011-10-25 10:00:11 +02004631{
Peter Zijlstrace48c1462018-01-22 22:53:28 +01004632 static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
Ben Segall1ee14e62013-10-16 11:16:12 -07004633}
4634
4635void cfs_bandwidth_usage_dec(void)
4636{
Peter Zijlstrace48c1462018-01-22 22:53:28 +01004637 static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02004638}
Masahiro Yamadae9666d12018-12-31 00:14:15 +09004639#else /* CONFIG_JUMP_LABEL */
Peter Zijlstra029632f2011-10-25 10:00:11 +02004640static bool cfs_bandwidth_used(void)
4641{
4642 return true;
4643}
4644
Ben Segall1ee14e62013-10-16 11:16:12 -07004645void cfs_bandwidth_usage_inc(void) {}
4646void cfs_bandwidth_usage_dec(void) {}
Masahiro Yamadae9666d12018-12-31 00:14:15 +09004647#endif /* CONFIG_JUMP_LABEL */
Peter Zijlstra029632f2011-10-25 10:00:11 +02004648
Paul Turnerab84d312011-07-21 09:43:28 -07004649/*
4650 * default period for cfs group bandwidth.
4651 * default: 0.1s, units: nanoseconds
4652 */
4653static inline u64 default_cfs_period(void)
4654{
4655 return 100000000ULL;
4656}
Paul Turnerec12cb72011-07-21 09:43:30 -07004657
4658static inline u64 sched_cfs_bandwidth_slice(void)
4659{
4660 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
4661}
4662
Paul Turnera9cf55b2011-07-21 09:43:32 -07004663/*
Qian Cai763a9ec2019-08-20 14:40:55 -04004664 * Replenish runtime according to assigned quota. We use sched_clock_cpu
4665 * directly instead of rq->clock to avoid adding additional synchronization
4666 * around rq->lock.
Paul Turnera9cf55b2011-07-21 09:43:32 -07004667 *
4668 * requires cfs_b->lock
4669 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02004670void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turnera9cf55b2011-07-21 09:43:32 -07004671{
Huaixin Changbcb17042021-08-30 11:22:14 +08004672 s64 runtime;
4673
Huaixin Changf4183712021-06-21 17:27:58 +08004674 if (unlikely(cfs_b->quota == RUNTIME_INF))
4675 return;
4676
4677 cfs_b->runtime += cfs_b->quota;
Huaixin Changbcb17042021-08-30 11:22:14 +08004678 runtime = cfs_b->runtime_snap - cfs_b->runtime;
4679 if (runtime > 0) {
4680 cfs_b->burst_time += runtime;
4681 cfs_b->nr_burst++;
4682 }
4683
Huaixin Changf4183712021-06-21 17:27:58 +08004684 cfs_b->runtime = min(cfs_b->runtime, cfs_b->quota + cfs_b->burst);
Huaixin Changbcb17042021-08-30 11:22:14 +08004685 cfs_b->runtime_snap = cfs_b->runtime;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004686}
4687
Peter Zijlstra029632f2011-10-25 10:00:11 +02004688static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4689{
4690 return &tg->cfs_bandwidth;
4691}
4692
Paul Turner85dac902011-07-21 09:43:33 -07004693/* returns 0 on failure to allocate runtime */
Paul Turnere98fa02c2020-04-10 15:52:07 -07004694static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
4695 struct cfs_rq *cfs_rq, u64 target_runtime)
Paul Turnerec12cb72011-07-21 09:43:30 -07004696{
Paul Turnere98fa02c2020-04-10 15:52:07 -07004697 u64 min_amount, amount = 0;
4698
4699 lockdep_assert_held(&cfs_b->lock);
Paul Turnerec12cb72011-07-21 09:43:30 -07004700
4701 /* note: this is a positive sum as runtime_remaining <= 0 */
Paul Turnere98fa02c2020-04-10 15:52:07 -07004702 min_amount = target_runtime - cfs_rq->runtime_remaining;
Paul Turnerec12cb72011-07-21 09:43:30 -07004703
Paul Turnerec12cb72011-07-21 09:43:30 -07004704 if (cfs_b->quota == RUNTIME_INF)
4705 amount = min_amount;
Paul Turner58088ad2011-07-21 09:43:31 -07004706 else {
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02004707 start_cfs_bandwidth(cfs_b);
Paul Turner58088ad2011-07-21 09:43:31 -07004708
4709 if (cfs_b->runtime > 0) {
4710 amount = min(cfs_b->runtime, min_amount);
4711 cfs_b->runtime -= amount;
4712 cfs_b->idle = 0;
4713 }
Paul Turnerec12cb72011-07-21 09:43:30 -07004714 }
Paul Turnerec12cb72011-07-21 09:43:30 -07004715
4716 cfs_rq->runtime_remaining += amount;
Paul Turner85dac902011-07-21 09:43:33 -07004717
4718 return cfs_rq->runtime_remaining > 0;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004719}
4720
Paul Turnere98fa02c2020-04-10 15:52:07 -07004721/* returns 0 on failure to allocate runtime */
4722static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4723{
4724 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4725 int ret;
4726
4727 raw_spin_lock(&cfs_b->lock);
4728 ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
4729 raw_spin_unlock(&cfs_b->lock);
4730
4731 return ret;
4732}
4733
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004734static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07004735{
Paul Turnera9cf55b2011-07-21 09:43:32 -07004736 /* dock delta_exec before expiring quota (as it could span periods) */
Paul Turnerec12cb72011-07-21 09:43:30 -07004737 cfs_rq->runtime_remaining -= delta_exec;
Paul Turnera9cf55b2011-07-21 09:43:32 -07004738
4739 if (likely(cfs_rq->runtime_remaining > 0))
Paul Turnerec12cb72011-07-21 09:43:30 -07004740 return;
4741
Liangyan5e2d2cc2019-08-26 20:16:33 +08004742 if (cfs_rq->throttled)
4743 return;
Paul Turner85dac902011-07-21 09:43:33 -07004744 /*
4745 * if we're unable to extend our runtime we resched so that the active
4746 * hierarchy can be throttled
4747 */
4748 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
Kirill Tkhai88751252014-06-29 00:03:57 +04004749 resched_curr(rq_of(cfs_rq));
Paul Turnerec12cb72011-07-21 09:43:30 -07004750}
4751
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07004752static __always_inline
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01004753void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07004754{
Paul Turner56f570e2011-11-07 20:26:33 -08004755 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turnerec12cb72011-07-21 09:43:30 -07004756 return;
4757
4758 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4759}
4760
Paul Turner85dac902011-07-21 09:43:33 -07004761static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4762{
Paul Turner56f570e2011-11-07 20:26:33 -08004763 return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner85dac902011-07-21 09:43:33 -07004764}
4765
Paul Turner64660c82011-07-21 09:43:36 -07004766/* check whether cfs_rq, or any parent, is throttled */
4767static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4768{
Paul Turner56f570e2011-11-07 20:26:33 -08004769 return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner64660c82011-07-21 09:43:36 -07004770}
4771
4772/*
4773 * Ensure that neither of the group entities corresponding to src_cpu or
4774 * dest_cpu are members of a throttled hierarchy when performing group
4775 * load-balance operations.
4776 */
4777static inline int throttled_lb_pair(struct task_group *tg,
4778 int src_cpu, int dest_cpu)
4779{
4780 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4781
4782 src_cfs_rq = tg->cfs_rq[src_cpu];
4783 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4784
4785 return throttled_hierarchy(src_cfs_rq) ||
4786 throttled_hierarchy(dest_cfs_rq);
4787}
4788
Paul Turner64660c82011-07-21 09:43:36 -07004789static int tg_unthrottle_up(struct task_group *tg, void *data)
4790{
4791 struct rq *rq = data;
4792 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4793
4794 cfs_rq->throttle_count--;
Paul Turner64660c82011-07-21 09:43:36 -07004795 if (!cfs_rq->throttle_count) {
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004796 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turnerf1b17282012-10-04 13:18:31 +02004797 cfs_rq->throttled_clock_task;
Vincent Guittot31bc6ae2019-02-06 17:14:21 +01004798
Odin Ugedala7b359f2021-06-12 13:28:15 +02004799 /* Add cfs_rq with load or one or more already running entities to the list */
4800 if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
Vincent Guittot31bc6ae2019-02-06 17:14:21 +01004801 list_add_leaf_cfs_rq(cfs_rq);
Paul Turner64660c82011-07-21 09:43:36 -07004802 }
Paul Turner64660c82011-07-21 09:43:36 -07004803
4804 return 0;
4805}
4806
4807static int tg_throttle_down(struct task_group *tg, void *data)
4808{
4809 struct rq *rq = data;
4810 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4811
Paul Turner82958362012-10-04 13:18:31 +02004812 /* group is entering throttled state, stop time */
Vincent Guittot31bc6ae2019-02-06 17:14:21 +01004813 if (!cfs_rq->throttle_count) {
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004814 cfs_rq->throttled_clock_task = rq_clock_task(rq);
Vincent Guittot31bc6ae2019-02-06 17:14:21 +01004815 list_del_leaf_cfs_rq(cfs_rq);
4816 }
Paul Turner64660c82011-07-21 09:43:36 -07004817 cfs_rq->throttle_count++;
4818
4819 return 0;
4820}
4821
Paul Turnere98fa02c2020-04-10 15:52:07 -07004822static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner85dac902011-07-21 09:43:33 -07004823{
4824 struct rq *rq = rq_of(cfs_rq);
4825 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4826 struct sched_entity *se;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05304827 long task_delta, idle_task_delta, dequeue = 1;
Paul Turnere98fa02c2020-04-10 15:52:07 -07004828
4829 raw_spin_lock(&cfs_b->lock);
4830 /* This will start the period timer if necessary */
4831 if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
4832 /*
4833 * We have raced with bandwidth becoming available, and if we
4834 * actually throttled the timer might not unthrottle us for an
4835 * entire period. We additionally needed to make sure that any
4836 * subsequent check_cfs_rq_runtime calls agree not to throttle
4837 * us, as we may commit to do cfs put_prev+pick_next, so we ask
4838 * for 1ns of runtime rather than just check cfs_b.
4839 */
4840 dequeue = 0;
4841 } else {
4842 list_add_tail_rcu(&cfs_rq->throttled_list,
4843 &cfs_b->throttled_cfs_rq);
4844 }
4845 raw_spin_unlock(&cfs_b->lock);
4846
4847 if (!dequeue)
4848 return false; /* Throttle no longer required. */
Paul Turner85dac902011-07-21 09:43:33 -07004849
4850 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4851
Paul Turnerf1b17282012-10-04 13:18:31 +02004852 /* freeze hierarchy runnable averages while throttled */
Paul Turner64660c82011-07-21 09:43:36 -07004853 rcu_read_lock();
4854 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4855 rcu_read_unlock();
Paul Turner85dac902011-07-21 09:43:33 -07004856
4857 task_delta = cfs_rq->h_nr_running;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05304858 idle_task_delta = cfs_rq->idle_h_nr_running;
Paul Turner85dac902011-07-21 09:43:33 -07004859 for_each_sched_entity(se) {
4860 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4861 /* throttled entity or throttle-on-deactivate */
4862 if (!se->on_rq)
Peng Wangb6d37a72020-11-10 10:11:59 +08004863 goto done;
Paul Turner85dac902011-07-21 09:43:33 -07004864
Peng Wangb6d37a72020-11-10 10:11:59 +08004865 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
Vincent Guittot62124372020-02-27 16:41:15 +01004866
Josh Don30400032021-07-29 19:00:18 -07004867 if (cfs_rq_is_idle(group_cfs_rq(se)))
4868 idle_task_delta = cfs_rq->h_nr_running;
4869
Paul Turner85dac902011-07-21 09:43:33 -07004870 qcfs_rq->h_nr_running -= task_delta;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05304871 qcfs_rq->idle_h_nr_running -= idle_task_delta;
Paul Turner85dac902011-07-21 09:43:33 -07004872
Peng Wangb6d37a72020-11-10 10:11:59 +08004873 if (qcfs_rq->load.weight) {
4874 /* Avoid re-evaluating load for this entity: */
4875 se = parent_entity(se);
4876 break;
4877 }
Paul Turner85dac902011-07-21 09:43:33 -07004878 }
4879
Peng Wangb6d37a72020-11-10 10:11:59 +08004880 for_each_sched_entity(se) {
4881 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4882 /* throttled entity or throttle-on-deactivate */
4883 if (!se->on_rq)
4884 goto done;
Paul Turner85dac902011-07-21 09:43:33 -07004885
Peng Wangb6d37a72020-11-10 10:11:59 +08004886 update_load_avg(qcfs_rq, se, 0);
4887 se_update_runnable(se);
4888
Josh Don30400032021-07-29 19:00:18 -07004889 if (cfs_rq_is_idle(group_cfs_rq(se)))
4890 idle_task_delta = cfs_rq->h_nr_running;
4891
Peng Wangb6d37a72020-11-10 10:11:59 +08004892 qcfs_rq->h_nr_running -= task_delta;
4893 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4894 }
4895
4896 /* At this point se is NULL and we are at root level*/
4897 sub_nr_running(rq, task_delta);
4898
4899done:
Paul Turnere98fa02c2020-04-10 15:52:07 -07004900 /*
4901 * Note: distribution will already see us throttled via the
4902 * throttled-list. rq->lock protects completion.
4903 */
Paul Turner85dac902011-07-21 09:43:33 -07004904 cfs_rq->throttled = 1;
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004905 cfs_rq->throttled_clock = rq_clock(rq);
Paul Turnere98fa02c2020-04-10 15:52:07 -07004906 return true;
Paul Turner85dac902011-07-21 09:43:33 -07004907}
4908
Peter Zijlstra029632f2011-10-25 10:00:11 +02004909void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner671fd9d2011-07-21 09:43:34 -07004910{
4911 struct rq *rq = rq_of(cfs_rq);
4912 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4913 struct sched_entity *se;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05304914 long task_delta, idle_task_delta;
Paul Turner671fd9d2011-07-21 09:43:34 -07004915
Michael Wang22b958d2013-06-04 14:23:39 +08004916 se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner671fd9d2011-07-21 09:43:34 -07004917
4918 cfs_rq->throttled = 0;
Frederic Weisbecker1a55af22013-04-12 01:51:01 +02004919
4920 update_rq_clock(rq);
4921
Paul Turner671fd9d2011-07-21 09:43:34 -07004922 raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker78becc22013-04-12 01:51:02 +02004923 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner671fd9d2011-07-21 09:43:34 -07004924 list_del_rcu(&cfs_rq->throttled_list);
4925 raw_spin_unlock(&cfs_b->lock);
4926
Paul Turner64660c82011-07-21 09:43:36 -07004927 /* update hierarchical throttle state */
4928 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4929
Michal Koutný2630cde2021-09-17 17:30:37 +02004930 /* Nothing to run but something to decay (on_list)? Complete the branch */
4931 if (!cfs_rq->load.weight) {
4932 if (cfs_rq->on_list)
4933 goto unthrottle_throttle;
Paul Turner671fd9d2011-07-21 09:43:34 -07004934 return;
Michal Koutný2630cde2021-09-17 17:30:37 +02004935 }
Paul Turner671fd9d2011-07-21 09:43:34 -07004936
4937 task_delta = cfs_rq->h_nr_running;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05304938 idle_task_delta = cfs_rq->idle_h_nr_running;
Paul Turner671fd9d2011-07-21 09:43:34 -07004939 for_each_sched_entity(se) {
Josh Don30400032021-07-29 19:00:18 -07004940 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4941
Paul Turner671fd9d2011-07-21 09:43:34 -07004942 if (se->on_rq)
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004943 break;
Josh Don30400032021-07-29 19:00:18 -07004944 enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
Vincent Guittot62124372020-02-27 16:41:15 +01004945
Josh Don30400032021-07-29 19:00:18 -07004946 if (cfs_rq_is_idle(group_cfs_rq(se)))
4947 idle_task_delta = cfs_rq->h_nr_running;
4948
4949 qcfs_rq->h_nr_running += task_delta;
4950 qcfs_rq->idle_h_nr_running += idle_task_delta;
Paul Turner671fd9d2011-07-21 09:43:34 -07004951
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004952 /* end evaluation on encountering a throttled cfs_rq */
Josh Don30400032021-07-29 19:00:18 -07004953 if (cfs_rq_throttled(qcfs_rq))
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004954 goto unthrottle_throttle;
Paul Turner671fd9d2011-07-21 09:43:34 -07004955 }
4956
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004957 for_each_sched_entity(se) {
Josh Don30400032021-07-29 19:00:18 -07004958 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
Paul Turner671fd9d2011-07-21 09:43:34 -07004959
Josh Don30400032021-07-29 19:00:18 -07004960 update_load_avg(qcfs_rq, se, UPDATE_TG);
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004961 se_update_runnable(se);
4962
Josh Don30400032021-07-29 19:00:18 -07004963 if (cfs_rq_is_idle(group_cfs_rq(se)))
4964 idle_task_delta = cfs_rq->h_nr_running;
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004965
Josh Don30400032021-07-29 19:00:18 -07004966 qcfs_rq->h_nr_running += task_delta;
4967 qcfs_rq->idle_h_nr_running += idle_task_delta;
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004968
4969 /* end evaluation on encountering a throttled cfs_rq */
Josh Don30400032021-07-29 19:00:18 -07004970 if (cfs_rq_throttled(qcfs_rq))
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004971 goto unthrottle_throttle;
4972
4973 /*
4974 * One parent has been throttled and cfs_rq removed from the
4975 * list. Add it back to not break the leaf list.
4976 */
Josh Don30400032021-07-29 19:00:18 -07004977 if (throttled_hierarchy(qcfs_rq))
4978 list_add_leaf_cfs_rq(qcfs_rq);
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004979 }
4980
4981 /* At this point se is NULL and we are at root level*/
4982 add_nr_running(rq, task_delta);
4983
4984unthrottle_throttle:
Vincent Guittotfe614682020-03-06 14:52:57 +01004985 /*
4986 * The cfs_rq_throttled() breaks in the above iteration can result in
4987 * incomplete leaf list maintenance, resulting in triggering the
4988 * assertion below.
4989 */
4990 for_each_sched_entity(se) {
Josh Don30400032021-07-29 19:00:18 -07004991 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
Vincent Guittotfe614682020-03-06 14:52:57 +01004992
Josh Don30400032021-07-29 19:00:18 -07004993 if (list_add_leaf_cfs_rq(qcfs_rq))
Vincent Guittot39f23ce2020-05-13 15:55:28 +02004994 break;
Vincent Guittotfe614682020-03-06 14:52:57 +01004995 }
4996
4997 assert_list_leaf_cfs_rq(rq);
4998
Ingo Molnar97fb7a02018-03-03 14:01:12 +01004999 /* Determine whether we need to wake up potentially idle CPU: */
Paul Turner671fd9d2011-07-21 09:43:34 -07005000 if (rq->curr == rq->idle && rq->cfs.nr_running)
Kirill Tkhai88751252014-06-29 00:03:57 +04005001 resched_curr(rq);
Paul Turner671fd9d2011-07-21 09:43:34 -07005002}
5003
Huaixin Chang26a8b122020-03-27 11:26:25 +08005004static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner671fd9d2011-07-21 09:43:34 -07005005{
5006 struct cfs_rq *cfs_rq;
Huaixin Chang26a8b122020-03-27 11:26:25 +08005007 u64 runtime, remaining = 1;
Paul Turner671fd9d2011-07-21 09:43:34 -07005008
5009 rcu_read_lock();
5010 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
5011 throttled_list) {
5012 struct rq *rq = rq_of(cfs_rq);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02005013 struct rq_flags rf;
Paul Turner671fd9d2011-07-21 09:43:34 -07005014
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005015 rq_lock_irqsave(rq, &rf);
Paul Turner671fd9d2011-07-21 09:43:34 -07005016 if (!cfs_rq_throttled(cfs_rq))
5017 goto next;
5018
Liangyan5e2d2cc2019-08-26 20:16:33 +08005019 /* By the above check, this should never be true */
5020 SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
5021
Huaixin Chang26a8b122020-03-27 11:26:25 +08005022 raw_spin_lock(&cfs_b->lock);
Paul Turner671fd9d2011-07-21 09:43:34 -07005023 runtime = -cfs_rq->runtime_remaining + 1;
Huaixin Chang26a8b122020-03-27 11:26:25 +08005024 if (runtime > cfs_b->runtime)
5025 runtime = cfs_b->runtime;
5026 cfs_b->runtime -= runtime;
5027 remaining = cfs_b->runtime;
5028 raw_spin_unlock(&cfs_b->lock);
Paul Turner671fd9d2011-07-21 09:43:34 -07005029
5030 cfs_rq->runtime_remaining += runtime;
Paul Turner671fd9d2011-07-21 09:43:34 -07005031
5032 /* we check whether we're throttled above */
5033 if (cfs_rq->runtime_remaining > 0)
5034 unthrottle_cfs_rq(cfs_rq);
5035
5036next:
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005037 rq_unlock_irqrestore(rq, &rf);
Paul Turner671fd9d2011-07-21 09:43:34 -07005038
5039 if (!remaining)
5040 break;
5041 }
5042 rcu_read_unlock();
Paul Turner671fd9d2011-07-21 09:43:34 -07005043}
5044
Paul Turner58088ad2011-07-21 09:43:31 -07005045/*
5046 * Responsible for refilling a task_group's bandwidth and unthrottling its
5047 * cfs_rqs as appropriate. If there has been no activity within the last
5048 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
5049 * used to track this state.
5050 */
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005051static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
Paul Turner58088ad2011-07-21 09:43:31 -07005052{
Ben Segall51f21762014-05-19 15:49:45 -07005053 int throttled;
Paul Turner58088ad2011-07-21 09:43:31 -07005054
Paul Turner58088ad2011-07-21 09:43:31 -07005055 /* no need to continue the timer with no bandwidth constraint */
5056 if (cfs_b->quota == RUNTIME_INF)
Ben Segall51f21762014-05-19 15:49:45 -07005057 goto out_deactivate;
Paul Turner58088ad2011-07-21 09:43:31 -07005058
Paul Turner671fd9d2011-07-21 09:43:34 -07005059 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Nikhil Raoe8da1b12011-07-21 09:43:40 -07005060 cfs_b->nr_periods += overrun;
Paul Turner671fd9d2011-07-21 09:43:34 -07005061
Huaixin Changf4183712021-06-21 17:27:58 +08005062 /* Refill extra burst quota even if cfs_b->idle */
5063 __refill_cfs_bandwidth_runtime(cfs_b);
5064
Ben Segall51f21762014-05-19 15:49:45 -07005065 /*
5066 * idle depends on !throttled (for the case of a large deficit), and if
5067 * we're going inactive then everything else can be deferred
5068 */
5069 if (cfs_b->idle && !throttled)
5070 goto out_deactivate;
Paul Turnera9cf55b2011-07-21 09:43:32 -07005071
Paul Turner671fd9d2011-07-21 09:43:34 -07005072 if (!throttled) {
5073 /* mark as potentially idle for the upcoming period */
5074 cfs_b->idle = 1;
Ben Segall51f21762014-05-19 15:49:45 -07005075 return 0;
Paul Turner671fd9d2011-07-21 09:43:34 -07005076 }
Paul Turner58088ad2011-07-21 09:43:31 -07005077
Nikhil Raoe8da1b12011-07-21 09:43:40 -07005078 /* account preceding periods in which throttling occurred */
5079 cfs_b->nr_throttled += overrun;
5080
Paul Turner671fd9d2011-07-21 09:43:34 -07005081 /*
Huaixin Chang26a8b122020-03-27 11:26:25 +08005082 * This check is repeated as we release cfs_b->lock while we unthrottle.
Paul Turner671fd9d2011-07-21 09:43:34 -07005083 */
Josh Donab93a4b2020-04-10 15:52:08 -07005084 while (throttled && cfs_b->runtime > 0) {
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005085 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Paul Turner671fd9d2011-07-21 09:43:34 -07005086 /* we can't nest cfs_b->lock while distributing bandwidth */
Huaixin Chang26a8b122020-03-27 11:26:25 +08005087 distribute_cfs_runtime(cfs_b);
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005088 raw_spin_lock_irqsave(&cfs_b->lock, flags);
Paul Turner671fd9d2011-07-21 09:43:34 -07005089
5090 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
5091 }
5092
Paul Turner671fd9d2011-07-21 09:43:34 -07005093 /*
5094 * While we are ensured activity in the period following an
5095 * unthrottle, this also covers the case in which the new bandwidth is
5096 * insufficient to cover the existing bandwidth deficit. (Forcing the
5097 * timer to remain active while there are any throttled entities.)
5098 */
5099 cfs_b->idle = 0;
Paul Turner58088ad2011-07-21 09:43:31 -07005100
Ben Segall51f21762014-05-19 15:49:45 -07005101 return 0;
5102
5103out_deactivate:
Ben Segall51f21762014-05-19 15:49:45 -07005104 return 1;
Paul Turner58088ad2011-07-21 09:43:31 -07005105}
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005106
Paul Turnerd8b49862011-07-21 09:43:41 -07005107/* a cfs_rq won't donate quota below this amount */
5108static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
5109/* minimum remaining period time to redistribute slack quota */
5110static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
5111/* how long we wait to gather additional slack before distributing */
5112static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
5113
Ben Segalldb06e782013-10-16 11:16:17 -07005114/*
5115 * Are we near the end of the current quota period?
5116 *
5117 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
Thomas Gleixner4961b6e2015-04-14 21:09:05 +00005118 * hrtimer base being cleared by hrtimer_start. In the case of
Ben Segalldb06e782013-10-16 11:16:17 -07005119 * migrate_hrtimers, base is never cleared, so we are fine.
5120 */
Paul Turnerd8b49862011-07-21 09:43:41 -07005121static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
5122{
5123 struct hrtimer *refresh_timer = &cfs_b->period_timer;
Odin Ugedal72d0ad72021-06-29 14:14:52 +02005124 s64 remaining;
Paul Turnerd8b49862011-07-21 09:43:41 -07005125
5126 /* if the call-back is running a quota refresh is already occurring */
5127 if (hrtimer_callback_running(refresh_timer))
5128 return 1;
5129
5130 /* is a quota refresh about to occur? */
5131 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
Odin Ugedal72d0ad72021-06-29 14:14:52 +02005132 if (remaining < (s64)min_expire)
Paul Turnerd8b49862011-07-21 09:43:41 -07005133 return 1;
5134
5135 return 0;
5136}
5137
5138static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
5139{
5140 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
5141
5142 /* if there's a quota refresh soon don't bother with slack */
5143 if (runtime_refresh_within(cfs_b, min_left))
5144 return;
5145
bsegall@google.com66567fc2019-06-06 10:21:01 -07005146 /* don't push forwards an existing deferred unthrottle */
5147 if (cfs_b->slack_started)
5148 return;
5149 cfs_b->slack_started = true;
5150
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005151 hrtimer_start(&cfs_b->slack_timer,
5152 ns_to_ktime(cfs_bandwidth_slack_period),
5153 HRTIMER_MODE_REL);
Paul Turnerd8b49862011-07-21 09:43:41 -07005154}
5155
5156/* we know any runtime found here is valid as update_curr() precedes return */
5157static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5158{
5159 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
5160 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
5161
5162 if (slack_runtime <= 0)
5163 return;
5164
5165 raw_spin_lock(&cfs_b->lock);
Dave Chilukde53fd72019-07-23 11:44:26 -05005166 if (cfs_b->quota != RUNTIME_INF) {
Paul Turnerd8b49862011-07-21 09:43:41 -07005167 cfs_b->runtime += slack_runtime;
5168
5169 /* we are under rq->lock, defer unthrottling using a timer */
5170 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
5171 !list_empty(&cfs_b->throttled_cfs_rq))
5172 start_cfs_slack_bandwidth(cfs_b);
5173 }
5174 raw_spin_unlock(&cfs_b->lock);
5175
5176 /* even if it's not valid for return we don't want to try again */
5177 cfs_rq->runtime_remaining -= slack_runtime;
5178}
5179
5180static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5181{
Paul Turner56f570e2011-11-07 20:26:33 -08005182 if (!cfs_bandwidth_used())
5183 return;
5184
Paul Turnerfccfdc62011-11-07 20:26:34 -08005185 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
Paul Turnerd8b49862011-07-21 09:43:41 -07005186 return;
5187
5188 __return_cfs_rq_runtime(cfs_rq);
5189}
5190
5191/*
5192 * This is done with a timer (instead of inline with bandwidth return) since
5193 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
5194 */
5195static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
5196{
5197 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005198 unsigned long flags;
Paul Turnerd8b49862011-07-21 09:43:41 -07005199
5200 /* confirm we're still not at a refresh boundary */
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005201 raw_spin_lock_irqsave(&cfs_b->lock, flags);
bsegall@google.com66567fc2019-06-06 10:21:01 -07005202 cfs_b->slack_started = false;
Phil Auldbaa9be42018-10-08 10:36:40 -04005203
Ben Segalldb06e782013-10-16 11:16:17 -07005204 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005205 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Ben Segalldb06e782013-10-16 11:16:17 -07005206 return;
5207 }
5208
Ben Segallc06f04c2014-06-20 15:21:20 -07005209 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
Paul Turnerd8b49862011-07-21 09:43:41 -07005210 runtime = cfs_b->runtime;
Ben Segallc06f04c2014-06-20 15:21:20 -07005211
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005212 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Paul Turnerd8b49862011-07-21 09:43:41 -07005213
5214 if (!runtime)
5215 return;
5216
Huaixin Chang26a8b122020-03-27 11:26:25 +08005217 distribute_cfs_runtime(cfs_b);
Paul Turnerd8b49862011-07-21 09:43:41 -07005218}
5219
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005220/*
5221 * When a group wakes up we want to make sure that its quota is not already
5222 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
Randy Dunlapc034f482021-02-25 17:21:10 -08005223 * runtime as update_curr() throttling can not trigger until it's on-rq.
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005224 */
5225static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
5226{
Paul Turner56f570e2011-11-07 20:26:33 -08005227 if (!cfs_bandwidth_used())
5228 return;
5229
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005230 /* an active group must be handled by the update_curr()->put() path */
5231 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
5232 return;
5233
5234 /* ensure the group is not already throttled */
5235 if (cfs_rq_throttled(cfs_rq))
5236 return;
5237
5238 /* update runtime allocation */
5239 account_cfs_rq_runtime(cfs_rq, 0);
5240 if (cfs_rq->runtime_remaining <= 0)
5241 throttle_cfs_rq(cfs_rq);
5242}
5243
Peter Zijlstra55e16d32016-06-22 15:14:26 +02005244static void sync_throttle(struct task_group *tg, int cpu)
5245{
5246 struct cfs_rq *pcfs_rq, *cfs_rq;
5247
5248 if (!cfs_bandwidth_used())
5249 return;
5250
5251 if (!tg->parent)
5252 return;
5253
5254 cfs_rq = tg->cfs_rq[cpu];
5255 pcfs_rq = tg->parent->cfs_rq[cpu];
5256
5257 cfs_rq->throttle_count = pcfs_rq->throttle_count;
Xunlei Pangb8922122016-07-09 15:54:22 +08005258 cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
Peter Zijlstra55e16d32016-06-22 15:14:26 +02005259}
5260
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005261/* conditionally throttle active cfs_rq's from put_prev_entity() */
Peter Zijlstra678d5712012-02-11 06:05:00 +01005262static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005263{
Paul Turner56f570e2011-11-07 20:26:33 -08005264 if (!cfs_bandwidth_used())
Peter Zijlstra678d5712012-02-11 06:05:00 +01005265 return false;
Paul Turner56f570e2011-11-07 20:26:33 -08005266
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005267 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
Peter Zijlstra678d5712012-02-11 06:05:00 +01005268 return false;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005269
5270 /*
5271 * it's possible for a throttled entity to be forced into a running
5272 * state (e.g. set_curr_task), in this case we're finished.
5273 */
5274 if (cfs_rq_throttled(cfs_rq))
Peter Zijlstra678d5712012-02-11 06:05:00 +01005275 return true;
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005276
Paul Turnere98fa02c2020-04-10 15:52:07 -07005277 return throttle_cfs_rq(cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005278}
Peter Zijlstra029632f2011-10-25 10:00:11 +02005279
Peter Zijlstra029632f2011-10-25 10:00:11 +02005280static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
5281{
5282 struct cfs_bandwidth *cfs_b =
5283 container_of(timer, struct cfs_bandwidth, slack_timer);
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005284
Peter Zijlstra029632f2011-10-25 10:00:11 +02005285 do_sched_cfs_slack_timer(cfs_b);
5286
5287 return HRTIMER_NORESTART;
5288}
5289
Phil Auld2e8e1922019-03-19 09:00:05 -04005290extern const u64 max_cfs_quota_period;
5291
Peter Zijlstra029632f2011-10-25 10:00:11 +02005292static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
5293{
5294 struct cfs_bandwidth *cfs_b =
5295 container_of(timer, struct cfs_bandwidth, period_timer);
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005296 unsigned long flags;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005297 int overrun;
5298 int idle = 0;
Phil Auld2e8e1922019-03-19 09:00:05 -04005299 int count = 0;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005300
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005301 raw_spin_lock_irqsave(&cfs_b->lock, flags);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005302 for (;;) {
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005303 overrun = hrtimer_forward_now(timer, cfs_b->period);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005304 if (!overrun)
5305 break;
5306
Huaixin Chang5a6d6a62020-04-20 10:44:21 +08005307 idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
5308
Phil Auld2e8e1922019-03-19 09:00:05 -04005309 if (++count > 3) {
5310 u64 new, old = ktime_to_ns(cfs_b->period);
5311
Xuewei Zhang4929a4e2019-10-03 17:12:43 -07005312 /*
5313 * Grow period by a factor of 2 to avoid losing precision.
5314 * Precision loss in the quota/period ratio can cause __cfs_schedulable
5315 * to fail.
5316 */
5317 new = old * 2;
5318 if (new < max_cfs_quota_period) {
5319 cfs_b->period = ns_to_ktime(new);
5320 cfs_b->quota *= 2;
Huaixin Changf4183712021-06-21 17:27:58 +08005321 cfs_b->burst *= 2;
Phil Auld2e8e1922019-03-19 09:00:05 -04005322
Xuewei Zhang4929a4e2019-10-03 17:12:43 -07005323 pr_warn_ratelimited(
5324 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5325 smp_processor_id(),
5326 div_u64(new, NSEC_PER_USEC),
5327 div_u64(cfs_b->quota, NSEC_PER_USEC));
5328 } else {
5329 pr_warn_ratelimited(
5330 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
5331 smp_processor_id(),
5332 div_u64(old, NSEC_PER_USEC),
5333 div_u64(cfs_b->quota, NSEC_PER_USEC));
5334 }
Phil Auld2e8e1922019-03-19 09:00:05 -04005335
5336 /* reset count so we don't come right back in here */
5337 count = 0;
5338 }
Peter Zijlstra029632f2011-10-25 10:00:11 +02005339 }
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005340 if (idle)
5341 cfs_b->period_active = 0;
Peter Zijlstrac0ad4aa2019-01-07 13:52:31 +01005342 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005343
5344 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
5345}
5346
5347void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5348{
5349 raw_spin_lock_init(&cfs_b->lock);
5350 cfs_b->runtime = 0;
5351 cfs_b->quota = RUNTIME_INF;
5352 cfs_b->period = ns_to_ktime(default_cfs_period());
Huaixin Changf4183712021-06-21 17:27:58 +08005353 cfs_b->burst = 0;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005354
5355 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005356 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005357 cfs_b->period_timer.function = sched_cfs_period_timer;
5358 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5359 cfs_b->slack_timer.function = sched_cfs_slack_timer;
bsegall@google.com66567fc2019-06-06 10:21:01 -07005360 cfs_b->slack_started = false;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005361}
5362
5363static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
5364{
5365 cfs_rq->runtime_enabled = 0;
5366 INIT_LIST_HEAD(&cfs_rq->throttled_list);
5367}
5368
Peter Zijlstra77a4d1a2015-04-15 11:41:57 +02005369void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
Peter Zijlstra029632f2011-10-25 10:00:11 +02005370{
Peter Zijlstra4cfafd32015-05-14 12:23:11 +02005371 lockdep_assert_held(&cfs_b->lock);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005372
Xunlei Pangf1d1be82018-06-20 18:18:34 +08005373 if (cfs_b->period_active)
5374 return;
5375
5376 cfs_b->period_active = 1;
Qian Cai763a9ec2019-08-20 14:40:55 -04005377 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
Xunlei Pangf1d1be82018-06-20 18:18:34 +08005378 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra029632f2011-10-25 10:00:11 +02005379}
5380
5381static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5382{
Tetsuo Handa7f1a1692014-12-25 15:51:21 +09005383 /* init_cfs_bandwidth() was not called */
5384 if (!cfs_b->throttled_cfs_rq.next)
5385 return;
5386
Peter Zijlstra029632f2011-10-25 10:00:11 +02005387 hrtimer_cancel(&cfs_b->period_timer);
5388 hrtimer_cancel(&cfs_b->slack_timer);
5389}
5390
Peter Zijlstra502ce002017-05-04 15:31:22 +02005391/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005392 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
Peter Zijlstra502ce002017-05-04 15:31:22 +02005393 *
5394 * The race is harmless, since modifying bandwidth settings of unhooked group
5395 * bits doesn't do much.
5396 */
5397
Ingo Molnar3b037062021-03-18 13:38:50 +01005398/* cpu online callback */
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005399static void __maybe_unused update_runtime_enabled(struct rq *rq)
5400{
Peter Zijlstra502ce002017-05-04 15:31:22 +02005401 struct task_group *tg;
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005402
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05005403 lockdep_assert_rq_held(rq);
Peter Zijlstra502ce002017-05-04 15:31:22 +02005404
5405 rcu_read_lock();
5406 list_for_each_entry_rcu(tg, &task_groups, list) {
5407 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
5408 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005409
5410 raw_spin_lock(&cfs_b->lock);
5411 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
5412 raw_spin_unlock(&cfs_b->lock);
5413 }
Peter Zijlstra502ce002017-05-04 15:31:22 +02005414 rcu_read_unlock();
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005415}
5416
Peter Zijlstra502ce002017-05-04 15:31:22 +02005417/* cpu offline callback */
Arnd Bergmann38dc3342013-01-25 14:14:22 +00005418static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra029632f2011-10-25 10:00:11 +02005419{
Peter Zijlstra502ce002017-05-04 15:31:22 +02005420 struct task_group *tg;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005421
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05005422 lockdep_assert_rq_held(rq);
Peter Zijlstra502ce002017-05-04 15:31:22 +02005423
5424 rcu_read_lock();
5425 list_for_each_entry_rcu(tg, &task_groups, list) {
5426 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
5427
Peter Zijlstra029632f2011-10-25 10:00:11 +02005428 if (!cfs_rq->runtime_enabled)
5429 continue;
5430
5431 /*
5432 * clock_task is not advancing so we just need to make sure
5433 * there's some valid quota amount
5434 */
Ben Segall51f21762014-05-19 15:49:45 -07005435 cfs_rq->runtime_remaining = 1;
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005436 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01005437 * Offline rq is schedulable till CPU is completely disabled
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005438 * in take_cpu_down(), so we prevent new cfs throttling here.
5439 */
5440 cfs_rq->runtime_enabled = 0;
5441
Peter Zijlstra029632f2011-10-25 10:00:11 +02005442 if (cfs_rq_throttled(cfs_rq))
5443 unthrottle_cfs_rq(cfs_rq);
5444 }
Peter Zijlstra502ce002017-05-04 15:31:22 +02005445 rcu_read_unlock();
Peter Zijlstra029632f2011-10-25 10:00:11 +02005446}
5447
5448#else /* CONFIG_CFS_BANDWIDTH */
Vincent Guittotf6783312019-01-30 06:22:47 +01005449
5450static inline bool cfs_bandwidth_used(void)
5451{
5452 return false;
5453}
5454
Peter Zijlstra9dbdb152013-11-18 18:27:06 +01005455static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
Peter Zijlstra678d5712012-02-11 06:05:00 +01005456static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
Paul Turnerd3d9dc32011-07-21 09:43:39 -07005457static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra55e16d32016-06-22 15:14:26 +02005458static inline void sync_throttle(struct task_group *tg, int cpu) {}
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07005459static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner85dac902011-07-21 09:43:33 -07005460
5461static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
5462{
5463 return 0;
5464}
Paul Turner64660c82011-07-21 09:43:36 -07005465
5466static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
5467{
5468 return 0;
5469}
5470
5471static inline int throttled_lb_pair(struct task_group *tg,
5472 int src_cpu, int dest_cpu)
5473{
5474 return 0;
5475}
Peter Zijlstra029632f2011-10-25 10:00:11 +02005476
5477void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
5478
5479#ifdef CONFIG_FAIR_GROUP_SCHED
5480static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turnerab84d312011-07-21 09:43:28 -07005481#endif
5482
Peter Zijlstra029632f2011-10-25 10:00:11 +02005483static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
5484{
5485 return NULL;
5486}
5487static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Kirill Tkhai0e59bda2014-06-25 12:19:42 +04005488static inline void update_runtime_enabled(struct rq *rq) {}
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -07005489static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra029632f2011-10-25 10:00:11 +02005490
5491#endif /* CONFIG_CFS_BANDWIDTH */
5492
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005493/**************************************************
5494 * CFS operations on tasks:
5495 */
5496
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005497#ifdef CONFIG_SCHED_HRTICK
5498static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
5499{
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005500 struct sched_entity *se = &p->se;
5501 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5502
Peter Zijlstra9148a3a2016-09-20 22:34:51 +02005503 SCHED_WARN_ON(task_rq(p) != rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005504
Srivatsa Vaddagiri8bf46a32016-09-16 18:28:51 -07005505 if (rq->cfs.h_nr_running > 1) {
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005506 u64 slice = sched_slice(cfs_rq, se);
5507 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
5508 s64 delta = slice - ran;
5509
5510 if (delta < 0) {
Hui Su65bcf072020-10-31 01:32:23 +08005511 if (task_current(rq, p))
Kirill Tkhai88751252014-06-29 00:03:57 +04005512 resched_curr(rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005513 return;
5514 }
Peter Zijlstra31656512008-07-18 18:01:23 +02005515 hrtick_start(rq, delta);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005516 }
5517}
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005518
5519/*
5520 * called from enqueue/dequeue and updates the hrtick when the
5521 * current task is from our class and nr_running is low enough
5522 * to matter.
5523 */
5524static void hrtick_update(struct rq *rq)
5525{
5526 struct task_struct *curr = rq->curr;
5527
Juri Lellie0ee4632021-02-08 08:35:54 +01005528 if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class)
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005529 return;
5530
5531 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
5532 hrtick_start_fair(rq, curr);
5533}
Dhaval Giani55e12e52008-06-24 23:39:43 +05305534#else /* !CONFIG_SCHED_HRTICK */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005535static inline void
5536hrtick_start_fair(struct rq *rq, struct task_struct *p)
5537{
5538}
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005539
5540static inline void hrtick_update(struct rq *rq)
5541{
5542}
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005543#endif
5544
Morten Rasmussen2802bf32018-12-03 09:56:25 +00005545#ifdef CONFIG_SMP
Morten Rasmussen2802bf32018-12-03 09:56:25 +00005546static inline bool cpu_overutilized(int cpu)
5547{
Dietmar Eggemann82762d22021-11-18 17:42:40 +01005548 return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu));
Morten Rasmussen2802bf32018-12-03 09:56:25 +00005549}
5550
5551static inline void update_overutilized_status(struct rq *rq)
5552{
Qais Youseff9f240f2019-06-04 12:14:58 +01005553 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
Morten Rasmussen2802bf32018-12-03 09:56:25 +00005554 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
Qais Youseff9f240f2019-06-04 12:14:58 +01005555 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5556 }
Morten Rasmussen2802bf32018-12-03 09:56:25 +00005557}
5558#else
5559static inline void update_overutilized_status(struct rq *rq) { }
5560#endif
5561
Viresh Kumar323af6d2020-01-08 13:57:04 +05305562/* Runqueue only has SCHED_IDLE tasks enqueued */
5563static int sched_idle_rq(struct rq *rq)
5564{
5565 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5566 rq->nr_running);
5567}
5568
Josh Dona480add2021-08-19 18:04:01 -07005569/*
5570 * Returns true if cfs_rq only has SCHED_IDLE entities enqueued. Note the use
5571 * of idle_nr_running, which does not consider idle descendants of normal
5572 * entities.
5573 */
5574static bool sched_idle_cfs_rq(struct cfs_rq *cfs_rq)
5575{
5576 return cfs_rq->nr_running &&
5577 cfs_rq->nr_running == cfs_rq->idle_nr_running;
5578}
5579
Viresh Kumarafa70d92020-01-20 11:29:05 +05305580#ifdef CONFIG_SMP
Viresh Kumar323af6d2020-01-08 13:57:04 +05305581static int sched_idle_cpu(int cpu)
5582{
5583 return sched_idle_rq(cpu_rq(cpu));
5584}
Viresh Kumarafa70d92020-01-20 11:29:05 +05305585#endif
Viresh Kumar323af6d2020-01-08 13:57:04 +05305586
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005587/*
5588 * The enqueue_task method is called before nr_running is
5589 * increased. Here we update the fair scheduling stats and
5590 * then put the task into the rbtree:
5591 */
Thomas Gleixnerea87bb72010-01-20 20:58:57 +00005592static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005593enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005594{
5595 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005596 struct sched_entity *se = &p->se;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05305597 int idle_h_nr_running = task_has_idle_policy(p);
Quentin Perret8e1ac422020-11-12 11:12:01 +00005598 int task_new = !(flags & ENQUEUE_WAKEUP);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005599
Rafael J. Wysocki8c34ab12016-09-09 23:59:33 +02005600 /*
Patrick Bellasi2539fc82018-05-24 15:10:23 +01005601 * The code below (indirectly) updates schedutil which looks at
5602 * the cfs_rq utilization to select a frequency.
5603 * Let's add the task's estimated utilization to the cfs_rq's
5604 * estimated utilization, before we update schedutil.
5605 */
5606 util_est_enqueue(&rq->cfs, p);
5607
5608 /*
Rafael J. Wysocki8c34ab12016-09-09 23:59:33 +02005609 * If in_iowait is set, the code below may not trigger any cpufreq
5610 * utilization updates, so do it here explicitly with the IOWAIT flag
5611 * passed.
5612 */
5613 if (p->in_iowait)
Viresh Kumar674e7542017-07-28 12:16:38 +05305614 cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);
Rafael J. Wysocki8c34ab12016-09-09 23:59:33 +02005615
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005616 for_each_sched_entity(se) {
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005617 if (se->on_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005618 break;
5619 cfs_rq = cfs_rq_of(se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005620 enqueue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07005621
Paul Turner953bfcd2011-07-21 09:43:27 -07005622 cfs_rq->h_nr_running++;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05305623 cfs_rq->idle_h_nr_running += idle_h_nr_running;
Paul Turner85dac902011-07-21 09:43:33 -07005624
Josh Don30400032021-07-29 19:00:18 -07005625 if (cfs_rq_is_idle(cfs_rq))
5626 idle_h_nr_running = 1;
5627
Vincent Guittot6d4d2242020-02-24 09:52:14 +00005628 /* end evaluation on encountering a throttled cfs_rq */
5629 if (cfs_rq_throttled(cfs_rq))
5630 goto enqueue_throttle;
5631
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005632 flags = ENQUEUE_WAKEUP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005633 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005634
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005635 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08005636 cfs_rq = cfs_rq_of(se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005637
Peter Zijlstra88c06162017-05-06 17:32:43 +02005638 update_load_avg(cfs_rq, se, UPDATE_TG);
Vincent Guittot9f683952020-02-24 09:52:18 +00005639 se_update_runnable(se);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02005640 update_cfs_group(se);
Vincent Guittot6d4d2242020-02-24 09:52:14 +00005641
5642 cfs_rq->h_nr_running++;
5643 cfs_rq->idle_h_nr_running += idle_h_nr_running;
Vincent Guittot5ab297b2020-03-06 09:42:08 +01005644
Josh Don30400032021-07-29 19:00:18 -07005645 if (cfs_rq_is_idle(cfs_rq))
5646 idle_h_nr_running = 1;
5647
Vincent Guittot5ab297b2020-03-06 09:42:08 +01005648 /* end evaluation on encountering a throttled cfs_rq */
5649 if (cfs_rq_throttled(cfs_rq))
5650 goto enqueue_throttle;
Phil Auldb34cb072020-05-12 09:52:22 -04005651
5652 /*
5653 * One parent has been throttled and cfs_rq removed from the
5654 * list. Add it back to not break the leaf list.
5655 */
5656 if (throttled_hierarchy(cfs_rq))
5657 list_add_leaf_cfs_rq(cfs_rq);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005658 }
5659
Vincent Guittot7d148be2020-05-13 15:55:02 +02005660 /* At this point se is NULL and we are at root level*/
5661 add_nr_running(rq, 1);
5662
5663 /*
5664 * Since new tasks are assigned an initial util_avg equal to
5665 * half of the spare capacity of their CPU, tiny tasks have the
5666 * ability to cross the overutilized threshold, which will
5667 * result in the load balancer ruining all the task placement
5668 * done by EAS. As a way to mitigate that effect, do not account
5669 * for the first enqueue operation of new tasks during the
5670 * overutilized flag detection.
5671 *
5672 * A better way of solving this problem would be to wait for
5673 * the PELT signals of tasks to converge before taking them
5674 * into account, but that is not straightforward to implement,
5675 * and the following generally works well enough in practice.
5676 */
Quentin Perret8e1ac422020-11-12 11:12:01 +00005677 if (!task_new)
Vincent Guittot7d148be2020-05-13 15:55:02 +02005678 update_overutilized_status(rq);
5679
Vincent Guittot6d4d2242020-02-24 09:52:14 +00005680enqueue_throttle:
Vincent Guittotf6783312019-01-30 06:22:47 +01005681 if (cfs_bandwidth_used()) {
5682 /*
5683 * When bandwidth control is enabled; the cfs_rq_throttled()
5684 * breaks in the above iteration can result in incomplete
5685 * leaf list maintenance, resulting in triggering the assertion
5686 * below.
5687 */
5688 for_each_sched_entity(se) {
5689 cfs_rq = cfs_rq_of(se);
5690
5691 if (list_add_leaf_cfs_rq(cfs_rq))
5692 break;
5693 }
5694 }
5695
Peter Zijlstra5d299ea2019-01-30 14:41:04 +01005696 assert_list_leaf_cfs_rq(rq);
5697
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005698 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005699}
5700
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005701static void set_next_buddy(struct sched_entity *se);
5702
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005703/*
5704 * The dequeue_task method is called before nr_running is
5705 * decreased. We remove the task from the rbtree and
5706 * update the fair scheduling stats:
5707 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005708static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005709{
5710 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01005711 struct sched_entity *se = &p->se;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005712 int task_sleep = flags & DEQUEUE_SLEEP;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05305713 int idle_h_nr_running = task_has_idle_policy(p);
Viresh Kumar323af6d2020-01-08 13:57:04 +05305714 bool was_sched_idle = sched_idle_rq(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005715
Xuewen Yan8c1f5602020-12-18 17:27:52 +08005716 util_est_dequeue(&rq->cfs, p);
5717
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005718 for_each_sched_entity(se) {
5719 cfs_rq = cfs_rq_of(se);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005720 dequeue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07005721
Paul Turner953bfcd2011-07-21 09:43:27 -07005722 cfs_rq->h_nr_running--;
Viresh Kumar43e9f7f2019-06-26 10:36:29 +05305723 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005724
Josh Don30400032021-07-29 19:00:18 -07005725 if (cfs_rq_is_idle(cfs_rq))
5726 idle_h_nr_running = 1;
5727
Vincent Guittot6d4d2242020-02-24 09:52:14 +00005728 /* end evaluation on encountering a throttled cfs_rq */
5729 if (cfs_rq_throttled(cfs_rq))
5730 goto dequeue_throttle;
5731
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005732 /* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005733 if (cfs_rq->load.weight) {
Konstantin Khlebnikov754bd592016-06-16 15:57:15 +03005734 /* Avoid re-evaluating load for this entity: */
5735 se = parent_entity(se);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005736 /*
5737 * Bias pick_next to pick a task from this cfs_rq, as
5738 * p is sleeping when it is within its sched_slice.
5739 */
Konstantin Khlebnikov754bd592016-06-16 15:57:15 +03005740 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
5741 set_next_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005742 break;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07005743 }
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005744 flags |= DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005745 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005746
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005747 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08005748 cfs_rq = cfs_rq_of(se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005749
Peter Zijlstra88c06162017-05-06 17:32:43 +02005750 update_load_avg(cfs_rq, se, UPDATE_TG);
Vincent Guittot9f683952020-02-24 09:52:18 +00005751 se_update_runnable(se);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +02005752 update_cfs_group(se);
Vincent Guittot6d4d2242020-02-24 09:52:14 +00005753
5754 cfs_rq->h_nr_running--;
5755 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
Vincent Guittot5ab297b2020-03-06 09:42:08 +01005756
Josh Don30400032021-07-29 19:00:18 -07005757 if (cfs_rq_is_idle(cfs_rq))
5758 idle_h_nr_running = 1;
5759
Vincent Guittot5ab297b2020-03-06 09:42:08 +01005760 /* end evaluation on encountering a throttled cfs_rq */
5761 if (cfs_rq_throttled(cfs_rq))
5762 goto dequeue_throttle;
5763
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005764 }
5765
Peng Wang423d02e2020-06-16 14:04:07 +08005766 /* At this point se is NULL and we are at root level*/
5767 sub_nr_running(rq, 1);
Yuyang Ducd126af2015-07-15 08:04:36 +08005768
Viresh Kumar323af6d2020-01-08 13:57:04 +05305769 /* balance early to pull high priority tasks */
5770 if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
5771 rq->next_balance = jiffies;
5772
Peng Wang423d02e2020-06-16 14:04:07 +08005773dequeue_throttle:
Xuewen Yan8c1f5602020-12-18 17:27:52 +08005774 util_est_update(&rq->cfs, p, task_sleep);
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02005775 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005776}
5777
Gregory Haskinse7693a32008-01-25 21:08:09 +01005778#ifdef CONFIG_SMP
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02005779
5780/* Working cpumask for: load_balance, load_balance_newidle. */
5781DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5782DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5783
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005784#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01005785
5786static struct {
5787 cpumask_var_t idle_cpus_mask;
5788 atomic_t nr_cpus;
Vincent Guittotf643ea22018-02-13 11:31:17 +01005789 int has_blocked; /* Idle CPUS has blocked load */
Valentin Schneider7fd7a9e2021-08-23 12:17:00 +01005790 int needs_update; /* Newly idle CPUs need their next_balance collated */
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01005791 unsigned long next_balance; /* in jiffy units */
Vincent Guittotf643ea22018-02-13 11:31:17 +01005792 unsigned long next_blocked; /* Next update of blocked load in jiffies */
Peter Zijlstrae022e0d2017-12-21 11:20:23 +01005793} nohz ____cacheline_aligned;
5794
Frederic Weisbecker9fd81dd2016-04-19 17:36:51 +02005795#endif /* CONFIG_NO_HZ_COMMON */
Peter Zijlstra3289bdb2015-04-14 13:19:42 +02005796
Vincent Guittotb0fb1eb2019-10-18 15:26:33 +02005797static unsigned long cpu_load(struct rq *rq)
5798{
5799 return cfs_rq_load_avg(&rq->cfs);
5800}
5801
Vincent Guittot3318544b2019-10-22 18:46:38 +02005802/*
5803 * cpu_load_without - compute CPU load without any contributions from *p
5804 * @cpu: the CPU which load is requested
5805 * @p: the task which load should be discounted
5806 *
5807 * The load of a CPU is defined by the load of tasks currently enqueued on that
5808 * CPU as well as tasks which are currently sleeping after an execution on that
5809 * CPU.
5810 *
5811 * This method returns the load of the specified CPU by discounting the load of
5812 * the specified task, whenever the task is currently contributing to the CPU
5813 * load.
5814 */
5815static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p)
5816{
5817 struct cfs_rq *cfs_rq;
5818 unsigned int load;
5819
5820 /* Task has no contribution or is new */
5821 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5822 return cpu_load(rq);
5823
5824 cfs_rq = &rq->cfs;
5825 load = READ_ONCE(cfs_rq->avg.load_avg);
5826
5827 /* Discount task's util from CPU's util */
5828 lsub_positive(&load, task_h_load(p));
5829
5830 return load;
5831}
5832
Vincent Guittot9f683952020-02-24 09:52:18 +00005833static unsigned long cpu_runnable(struct rq *rq)
5834{
5835 return cfs_rq_runnable_avg(&rq->cfs);
5836}
5837
Vincent Guittot070f5e82020-02-24 09:52:19 +00005838static unsigned long cpu_runnable_without(struct rq *rq, struct task_struct *p)
5839{
5840 struct cfs_rq *cfs_rq;
5841 unsigned int runnable;
5842
5843 /* Task has no contribution or is new */
5844 if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
5845 return cpu_runnable(rq);
5846
5847 cfs_rq = &rq->cfs;
5848 runnable = READ_ONCE(cfs_rq->avg.runnable_avg);
5849
5850 /* Discount task's runnable from CPU's runnable */
5851 lsub_positive(&runnable, p->se.avg.runnable_avg);
5852
5853 return runnable;
5854}
5855
Nicolas Pitreced549f2014-05-26 18:19:38 -04005856static unsigned long capacity_of(int cpu)
Peter Zijlstra029632f2011-10-25 10:00:11 +02005857{
Nicolas Pitreced549f2014-05-26 18:19:38 -04005858 return cpu_rq(cpu)->cpu_capacity;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005859}
5860
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02005861static void record_wakee(struct task_struct *p)
5862{
5863 /*
5864 * Only decay a single time; tasks that have less then 1 wakeup per
5865 * jiffy will not have built up many flips.
5866 */
5867 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5868 current->wakee_flips >>= 1;
5869 current->wakee_flip_decay_ts = jiffies;
5870 }
5871
5872 if (current->last_wakee != p) {
5873 current->last_wakee = p;
5874 current->wakee_flips++;
5875 }
5876}
5877
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005878/*
5879 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02005880 *
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005881 * A waker of many should wake a different task than the one last awakened
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02005882 * at a frequency roughly N times higher than one of its wakees.
5883 *
5884 * In order to determine whether we should let the load spread vs consolidating
5885 * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
5886 * partner, and a factor of lls_size higher frequency in the other.
5887 *
5888 * With both conditions met, we can be relatively sure that the relationship is
5889 * non-monogamous, with partner count exceeding socket size.
5890 *
5891 * Waker/wakee being client/server, worker/dispatcher, interrupt source or
5892 * whatever is irrelevant, spread criteria is apparent partner count exceeds
5893 * socket size.
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005894 */
Michael Wang62470412013-07-04 12:55:51 +08005895static int wake_wide(struct task_struct *p)
5896{
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005897 unsigned int master = current->wakee_flips;
5898 unsigned int slave = p->wakee_flips;
Muchun Song17c891a2020-04-21 22:41:23 +08005899 int factor = __this_cpu_read(sd_llc_size);
Michael Wang62470412013-07-04 12:55:51 +08005900
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02005901 if (master < slave)
5902 swap(master, slave);
5903 if (slave < factor || master < slave * factor)
5904 return 0;
5905 return 1;
Michael Wang62470412013-07-04 12:55:51 +08005906}
5907
Peter Zijlstra90001d62017-07-31 17:50:05 +02005908/*
Peter Zijlstrad153b152017-09-27 11:35:30 +02005909 * The purpose of wake_affine() is to quickly determine on which CPU we can run
5910 * soonest. For the purpose of speed we only consider the waking and previous
5911 * CPU.
Peter Zijlstra90001d62017-07-31 17:50:05 +02005912 *
Mel Gorman7332dec2017-12-19 08:59:47 +00005913 * wake_affine_idle() - only considers 'now', it check if the waking CPU is
5914 * cache-affine and is (or will be) idle.
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005915 *
5916 * wake_affine_weight() - considers the weight to reflect the average
5917 * scheduling latency of the CPUs. This seems to work
5918 * for the overloaded case.
Peter Zijlstra90001d62017-07-31 17:50:05 +02005919 */
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005920static int
Mel Gorman89a55f52018-01-30 10:45:52 +00005921wake_affine_idle(int this_cpu, int prev_cpu, int sync)
Peter Zijlstra90001d62017-07-31 17:50:05 +02005922{
Mel Gorman7332dec2017-12-19 08:59:47 +00005923 /*
5924 * If this_cpu is idle, it implies the wakeup is from interrupt
5925 * context. Only allow the move if cache is shared. Otherwise an
5926 * interrupt intensive workload could force all tasks onto one
5927 * node depending on the IO topology or IRQ affinity settings.
Mel Gorman806486c2018-01-30 10:45:54 +00005928 *
5929 * If the prev_cpu is idle and cache affine then avoid a migration.
5930 * There is no guarantee that the cache hot data from an interrupt
5931 * is more important than cache hot data on the prev_cpu and from
5932 * a cpufreq perspective, it's better to have higher utilisation
5933 * on one CPU.
Mel Gorman7332dec2017-12-19 08:59:47 +00005934 */
Rohit Jain943d3552018-05-09 09:39:48 -07005935 if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
5936 return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005937
Peter Zijlstrad153b152017-09-27 11:35:30 +02005938 if (sync && cpu_rq(this_cpu)->nr_running == 1)
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005939 return this_cpu;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005940
Julia Lawalld8fcb812020-10-22 15:15:50 +02005941 if (available_idle_cpu(prev_cpu))
5942 return prev_cpu;
5943
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005944 return nr_cpumask_bits;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005945}
5946
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005947static int
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005948wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5949 int this_cpu, int prev_cpu, int sync)
Peter Zijlstra90001d62017-07-31 17:50:05 +02005950{
Peter Zijlstra90001d62017-07-31 17:50:05 +02005951 s64 this_eff_load, prev_eff_load;
5952 unsigned long task_load;
5953
Vincent Guittot11f10e52019-10-18 15:26:36 +02005954 this_eff_load = cpu_load(cpu_rq(this_cpu));
Peter Zijlstra90001d62017-07-31 17:50:05 +02005955
Peter Zijlstra90001d62017-07-31 17:50:05 +02005956 if (sync) {
5957 unsigned long current_load = task_h_load(current);
5958
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005959 if (current_load > this_eff_load)
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005960 return this_cpu;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005961
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005962 this_eff_load -= current_load;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005963 }
5964
Peter Zijlstra90001d62017-07-31 17:50:05 +02005965 task_load = task_h_load(p);
5966
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005967 this_eff_load += task_load;
5968 if (sched_feat(WA_BIAS))
5969 this_eff_load *= 100;
5970 this_eff_load *= capacity_of(prev_cpu);
Peter Zijlstra90001d62017-07-31 17:50:05 +02005971
Vincent Guittot11f10e52019-10-18 15:26:36 +02005972 prev_eff_load = cpu_load(cpu_rq(prev_cpu));
Peter Zijlstraf2cdd9c2017-10-06 09:23:24 +02005973 prev_eff_load -= task_load;
5974 if (sched_feat(WA_BIAS))
5975 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5976 prev_eff_load *= capacity_of(this_cpu);
Peter Zijlstra90001d62017-07-31 17:50:05 +02005977
Mel Gorman082f7642018-02-13 13:37:27 +00005978 /*
5979 * If sync, adjust the weight of prev_eff_load such that if
5980 * prev_eff == this_eff that select_idle_sibling() will consider
5981 * stacking the wakee on top of the waker if no other CPU is
5982 * idle.
5983 */
5984 if (sync)
5985 prev_eff_load += 1;
5986
5987 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
Peter Zijlstra90001d62017-07-31 17:50:05 +02005988}
5989
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01005990static int wake_affine(struct sched_domain *sd, struct task_struct *p,
Mel Gorman7ebb66a2018-02-13 13:37:25 +00005991 int this_cpu, int prev_cpu, int sync)
Ingo Molnar098fb9d2008-03-16 20:36:10 +01005992{
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005993 int target = nr_cpumask_bits;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01005994
Mel Gorman89a55f52018-01-30 10:45:52 +00005995 if (sched_feat(WA_IDLE))
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005996 target = wake_affine_idle(this_cpu, prev_cpu, sync);
Peter Zijlstra90001d62017-07-31 17:50:05 +02005997
Mel Gorman3b76c4a2018-01-30 10:45:53 +00005998 if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits)
5999 target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
Mike Galbraithb3137bc2008-05-29 11:11:41 +02006000
Yafang Shaoceeadb82021-09-05 14:35:41 +00006001 schedstat_inc(p->stats.nr_wakeups_affine_attempts);
Mel Gorman3b76c4a2018-01-30 10:45:53 +00006002 if (target == nr_cpumask_bits)
6003 return prev_cpu;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02006004
Mel Gorman3b76c4a2018-01-30 10:45:53 +00006005 schedstat_inc(sd->ttwu_move_affine);
Yafang Shaoceeadb82021-09-05 14:35:41 +00006006 schedstat_inc(p->stats.nr_wakeups_affine);
Mel Gorman3b76c4a2018-01-30 10:45:53 +00006007 return target;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01006008}
6009
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006010static struct sched_group *
Valentin Schneider45da2772020-04-15 22:05:04 +01006011find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu);
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006012
6013/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006014 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006015 */
6016static int
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006017find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006018{
6019 unsigned long load, min_load = ULONG_MAX;
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006020 unsigned int min_exit_latency = UINT_MAX;
6021 u64 latest_idle_timestamp = 0;
6022 int least_loaded_cpu = this_cpu;
Viresh Kumar17346452019-11-14 16:19:27 +05306023 int shallowest_idle_cpu = -1;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006024 int i;
6025
Morten Rasmusseneaecf412016-06-22 18:03:14 +01006026 /* Check if we have any choice: */
6027 if (group->group_weight == 1)
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02006028 return cpumask_first(sched_group_span(group));
Morten Rasmusseneaecf412016-06-22 18:03:14 +01006029
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006030 /* Traverse only the allowed CPUs */
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02006031 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
Aubrey Li97886d92021-03-24 17:40:13 -04006032 struct rq *rq = cpu_rq(i);
6033
6034 if (!sched_core_cookie_match(rq, p))
6035 continue;
6036
Viresh Kumar17346452019-11-14 16:19:27 +05306037 if (sched_idle_cpu(i))
6038 return i;
6039
Rohit Jain943d3552018-05-09 09:39:48 -07006040 if (available_idle_cpu(i)) {
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006041 struct cpuidle_state *idle = idle_get_state(rq);
6042 if (idle && idle->exit_latency < min_exit_latency) {
6043 /*
6044 * We give priority to a CPU whose idle state
6045 * has the smallest exit latency irrespective
6046 * of any idle timestamp.
6047 */
6048 min_exit_latency = idle->exit_latency;
6049 latest_idle_timestamp = rq->idle_stamp;
6050 shallowest_idle_cpu = i;
6051 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
6052 rq->idle_stamp > latest_idle_timestamp) {
6053 /*
6054 * If equal or no active idle state, then
6055 * the most recently idled CPU might have
6056 * a warmer cache.
6057 */
6058 latest_idle_timestamp = rq->idle_stamp;
6059 shallowest_idle_cpu = i;
6060 }
Viresh Kumar17346452019-11-14 16:19:27 +05306061 } else if (shallowest_idle_cpu == -1) {
Vincent Guittot11f10e52019-10-18 15:26:36 +02006062 load = cpu_load(cpu_rq(i));
Joel Fernandes18cec7e2017-12-15 07:39:44 -08006063 if (load < min_load) {
Nicolas Pitre83a0a962014-09-04 11:32:10 -04006064 min_load = load;
6065 least_loaded_cpu = i;
6066 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01006067 }
6068 }
6069
Viresh Kumar17346452019-11-14 16:19:27 +05306070 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006071}
Gregory Haskinse7693a32008-01-25 21:08:09 +01006072
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006073static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6074 int cpu, int prev_cpu, int sd_flag)
6075{
Brendan Jackman93f50f92017-10-05 12:45:16 +01006076 int new_cpu = cpu;
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006077
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02006078 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
Brendan Jackman6fee85c2017-10-05 12:45:15 +01006079 return prev_cpu;
6080
Viresh Kumarc976a862018-04-26 16:00:51 +05306081 /*
Vincent Guittot57abff02019-10-18 15:26:38 +02006082 * We need task's util for cpu_util_without, sync it up to
Patrick Bellasic4699332018-11-05 14:53:58 +00006083 * prev_cpu's last_update_time.
Viresh Kumarc976a862018-04-26 16:00:51 +05306084 */
6085 if (!(sd_flag & SD_BALANCE_FORK))
6086 sync_entity_load_avg(&p->se);
6087
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006088 while (sd) {
6089 struct sched_group *group;
6090 struct sched_domain *tmp;
6091 int weight;
6092
6093 if (!(sd->flags & sd_flag)) {
6094 sd = sd->child;
6095 continue;
6096 }
6097
Valentin Schneider45da2772020-04-15 22:05:04 +01006098 group = find_idlest_group(sd, p, cpu);
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006099 if (!group) {
6100 sd = sd->child;
6101 continue;
6102 }
6103
6104 new_cpu = find_idlest_group_cpu(group, p, cpu);
Brendan Jackmane90381e2017-10-05 12:45:13 +01006105 if (new_cpu == cpu) {
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006106 /* Now try balancing at a lower domain level of 'cpu': */
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006107 sd = sd->child;
6108 continue;
6109 }
6110
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006111 /* Now try balancing at a lower domain level of 'new_cpu': */
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006112 cpu = new_cpu;
6113 weight = sd->span_weight;
6114 sd = NULL;
6115 for_each_domain(cpu, tmp) {
6116 if (weight <= tmp->span_weight)
6117 break;
6118 if (tmp->flags & sd_flag)
6119 sd = tmp;
6120 }
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006121 }
6122
6123 return new_cpu;
6124}
6125
Aubrey Li97886d92021-03-24 17:40:13 -04006126static inline int __select_idle_cpu(int cpu, struct task_struct *p)
Mel Gorman9fe1f122021-01-27 13:52:03 +00006127{
Aubrey Li97886d92021-03-24 17:40:13 -04006128 if ((available_idle_cpu(cpu) || sched_idle_cpu(cpu)) &&
6129 sched_cpu_cookie_match(cpu_rq(cpu), p))
Mel Gorman9fe1f122021-01-27 13:52:03 +00006130 return cpu;
6131
6132 return -1;
6133}
6134
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006135#ifdef CONFIG_SCHED_SMT
Peter Zijlstraba2591a2018-05-29 16:43:46 +02006136DEFINE_STATIC_KEY_FALSE(sched_smt_present);
Josh Poimboeufb2849092019-01-30 07:13:58 -06006137EXPORT_SYMBOL_GPL(sched_smt_present);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006138
6139static inline void set_idle_cores(int cpu, int val)
6140{
6141 struct sched_domain_shared *sds;
6142
6143 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6144 if (sds)
6145 WRITE_ONCE(sds->has_idle_cores, val);
6146}
6147
6148static inline bool test_idle_cores(int cpu, bool def)
6149{
6150 struct sched_domain_shared *sds;
6151
Rik van Rielc722f352021-03-26 15:19:32 -04006152 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
6153 if (sds)
6154 return READ_ONCE(sds->has_idle_cores);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006155
6156 return def;
6157}
6158
6159/*
6160 * Scans the local SMT mask to see if the entire core is idle, and records this
6161 * information in sd_llc_shared->has_idle_cores.
6162 *
6163 * Since SMT siblings share all cache levels, inspecting this limited remote
6164 * state should be fairly cheap.
6165 */
Peter Zijlstra1b568f02016-05-09 10:38:41 +02006166void __update_idle_core(struct rq *rq)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006167{
6168 int core = cpu_of(rq);
6169 int cpu;
6170
6171 rcu_read_lock();
6172 if (test_idle_cores(core, true))
6173 goto unlock;
6174
6175 for_each_cpu(cpu, cpu_smt_mask(core)) {
6176 if (cpu == core)
6177 continue;
6178
Rohit Jain943d3552018-05-09 09:39:48 -07006179 if (!available_idle_cpu(cpu))
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006180 goto unlock;
6181 }
6182
6183 set_idle_cores(core, 1);
6184unlock:
6185 rcu_read_unlock();
6186}
6187
6188/*
6189 * Scan the entire LLC domain for idle cores; this dynamically switches off if
6190 * there are no idle cores left in the system; tracked through
6191 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
6192 */
Mel Gorman9fe1f122021-01-27 13:52:03 +00006193static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006194{
Mel Gorman9fe1f122021-01-27 13:52:03 +00006195 bool idle = true;
6196 int cpu;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006197
Peter Zijlstra1b568f02016-05-09 10:38:41 +02006198 if (!static_branch_likely(&sched_smt_present))
Aubrey Li97886d92021-03-24 17:40:13 -04006199 return __select_idle_cpu(core, p);
Peter Zijlstra1b568f02016-05-09 10:38:41 +02006200
Mel Gorman9fe1f122021-01-27 13:52:03 +00006201 for_each_cpu(cpu, cpu_smt_mask(core)) {
6202 if (!available_idle_cpu(cpu)) {
6203 idle = false;
6204 if (*idle_cpu == -1) {
6205 if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) {
6206 *idle_cpu = cpu;
6207 break;
6208 }
6209 continue;
Srikar Dronamrajubec28602019-12-06 22:54:22 +05306210 }
Mel Gorman9fe1f122021-01-27 13:52:03 +00006211 break;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006212 }
Mel Gorman9fe1f122021-01-27 13:52:03 +00006213 if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr))
6214 *idle_cpu = cpu;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006215 }
6216
Mel Gorman9fe1f122021-01-27 13:52:03 +00006217 if (idle)
6218 return core;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006219
Mel Gorman9fe1f122021-01-27 13:52:03 +00006220 cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006221 return -1;
6222}
6223
Rik van Rielc722f352021-03-26 15:19:32 -04006224/*
6225 * Scan the local SMT mask for idle CPUs.
6226 */
6227static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6228{
6229 int cpu;
6230
6231 for_each_cpu(cpu, cpu_smt_mask(target)) {
6232 if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
6233 !cpumask_test_cpu(cpu, sched_domain_span(sd)))
6234 continue;
6235 if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
6236 return cpu;
6237 }
6238
6239 return -1;
6240}
6241
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006242#else /* CONFIG_SCHED_SMT */
6243
Mel Gorman9fe1f122021-01-27 13:52:03 +00006244static inline void set_idle_cores(int cpu, int val)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006245{
Mel Gorman9fe1f122021-01-27 13:52:03 +00006246}
6247
6248static inline bool test_idle_cores(int cpu, bool def)
6249{
6250 return def;
6251}
6252
6253static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu)
6254{
Aubrey Li97886d92021-03-24 17:40:13 -04006255 return __select_idle_cpu(core, p);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006256}
6257
Rik van Rielc722f352021-03-26 15:19:32 -04006258static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
6259{
6260 return -1;
6261}
6262
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006263#endif /* CONFIG_SCHED_SMT */
6264
6265/*
6266 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
6267 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
6268 * average idle time for this rq (as found in rq->avg_idle).
6269 */
Rik van Rielc722f352021-03-26 15:19:32 -04006270static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006271{
Cheng Jian60588bf2019-12-13 10:45:30 +08006272 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
Mel Gorman9fe1f122021-01-27 13:52:03 +00006273 int i, cpu, idle_cpu = -1, nr = INT_MAX;
Peter Zijlstra94aafc32021-06-15 12:16:11 +01006274 struct rq *this_rq = this_rq();
Mel Gorman9fe1f122021-01-27 13:52:03 +00006275 int this = smp_processor_id();
Wanpeng Li9cfb38a2016-10-09 08:04:03 +08006276 struct sched_domain *this_sd;
Peter Zijlstra94aafc32021-06-15 12:16:11 +01006277 u64 time = 0;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006278
Wanpeng Li9cfb38a2016-10-09 08:04:03 +08006279 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
6280 if (!this_sd)
6281 return -1;
6282
Mel Gormanbae4ec12021-01-25 08:59:07 +00006283 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6284
Rik van Rielc722f352021-03-26 15:19:32 -04006285 if (sched_feat(SIS_PROP) && !has_idle_core) {
Mel Gormane6e0dc22021-01-25 08:59:06 +00006286 u64 avg_cost, avg_idle, span_avg;
Peter Zijlstra94aafc32021-06-15 12:16:11 +01006287 unsigned long now = jiffies;
Mel Gormane6e0dc22021-01-25 08:59:06 +00006288
6289 /*
Peter Zijlstra94aafc32021-06-15 12:16:11 +01006290 * If we're busy, the assumption that the last idle period
6291 * predicts the future is flawed; age away the remaining
6292 * predicted idle time.
Mel Gormane6e0dc22021-01-25 08:59:06 +00006293 */
Peter Zijlstra94aafc32021-06-15 12:16:11 +01006294 if (unlikely(this_rq->wake_stamp < now)) {
6295 while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
6296 this_rq->wake_stamp++;
6297 this_rq->wake_avg_idle >>= 1;
6298 }
6299 }
6300
6301 avg_idle = this_rq->wake_avg_idle;
Mel Gormane6e0dc22021-01-25 08:59:06 +00006302 avg_cost = this_sd->avg_scan_cost + 1;
6303
6304 span_avg = sd->span_weight * avg_idle;
Peter Zijlstra1ad3aaf2017-05-17 12:53:50 +02006305 if (span_avg > 4*avg_cost)
6306 nr = div_u64(span_avg, avg_cost);
6307 else
6308 nr = 4;
Mel Gormanbae4ec12021-01-25 08:59:07 +00006309
6310 time = cpu_clock(this);
Peter Zijlstra1ad3aaf2017-05-17 12:53:50 +02006311 }
6312
Mel Gorman56498cf2021-08-04 12:58:57 +01006313 for_each_cpu_wrap(cpu, cpus, target + 1) {
Rik van Rielc722f352021-03-26 15:19:32 -04006314 if (has_idle_core) {
Mel Gorman9fe1f122021-01-27 13:52:03 +00006315 i = select_idle_core(p, cpu, cpus, &idle_cpu);
6316 if ((unsigned int)i < nr_cpumask_bits)
6317 return i;
6318
6319 } else {
6320 if (!--nr)
6321 return -1;
Aubrey Li97886d92021-03-24 17:40:13 -04006322 idle_cpu = __select_idle_cpu(cpu, p);
Mel Gorman9fe1f122021-01-27 13:52:03 +00006323 if ((unsigned int)idle_cpu < nr_cpumask_bits)
6324 break;
6325 }
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006326 }
6327
Rik van Rielc722f352021-03-26 15:19:32 -04006328 if (has_idle_core)
Gautham R. Shenoy02dbb722021-05-11 20:46:09 +05306329 set_idle_cores(target, false);
Mel Gorman9fe1f122021-01-27 13:52:03 +00006330
Rik van Rielc722f352021-03-26 15:19:32 -04006331 if (sched_feat(SIS_PROP) && !has_idle_core) {
Mel Gormanbae4ec12021-01-25 08:59:07 +00006332 time = cpu_clock(this) - time;
Peter Zijlstra94aafc32021-06-15 12:16:11 +01006333
6334 /*
6335 * Account for the scan cost of wakeups against the average
6336 * idle time.
6337 */
6338 this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
6339
Mel Gormanbae4ec12021-01-25 08:59:07 +00006340 update_avg(&this_sd->avg_scan_cost, time);
6341 }
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006342
Mel Gorman9fe1f122021-01-27 13:52:03 +00006343 return idle_cpu;
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006344}
6345
6346/*
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006347 * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which
6348 * the task fits. If no CPU is big enough, but there are idle ones, try to
6349 * maximize capacity.
6350 */
6351static int
6352select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
6353{
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006354 unsigned long task_util, best_cap = 0;
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006355 int cpu, best_cpu = -1;
6356 struct cpumask *cpus;
6357
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006358 cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
6359 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6360
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006361 task_util = uclamp_task_util(p);
6362
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006363 for_each_cpu_wrap(cpu, cpus, target) {
6364 unsigned long cpu_cap = capacity_of(cpu);
6365
6366 if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
6367 continue;
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006368 if (fits_capacity(task_util, cpu_cap))
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006369 return cpu;
6370
6371 if (cpu_cap > best_cap) {
6372 best_cap = cpu_cap;
6373 best_cpu = cpu;
6374 }
6375 }
6376
6377 return best_cpu;
6378}
6379
Vincent Donnefortef8df972021-12-07 09:57:55 +00006380static inline bool asym_fits_capacity(unsigned long task_util, int cpu)
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006381{
6382 if (static_branch_unlikely(&sched_asym_cpucapacity))
6383 return fits_capacity(task_util, capacity_of(cpu));
6384
6385 return true;
6386}
6387
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006388/*
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006389 * Try and locate an idle core/thread in the LLC cache domain.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006390 */
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006391static int select_idle_sibling(struct task_struct *p, int prev, int target)
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006392{
Rik van Rielc722f352021-03-26 15:19:32 -04006393 bool has_idle_core = false;
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006394 struct sched_domain *sd;
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006395 unsigned long task_util;
Mel Gorman32e839d2018-01-30 10:45:55 +00006396 int i, recent_used_cpu;
Mike Galbraithe0a79f52013-01-28 12:19:25 +01006397
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006398 /*
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006399 * On asymmetric system, update task utilization because we will check
6400 * that the task fits with cpu's capacity.
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006401 */
6402 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006403 sync_entity_load_avg(&p->se);
6404 task_util = uclamp_task_util(p);
Morten Rasmussenb7a33162020-02-06 19:19:54 +00006405 }
6406
Peter Zijlstra9099a14702020-11-17 18:19:35 -05006407 /*
6408 * per-cpu select_idle_mask usage
6409 */
6410 lockdep_assert_irqs_disabled();
6411
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006412 if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
6413 asym_fits_capacity(task_util, target))
Mike Galbraithe0a79f52013-01-28 12:19:25 +01006414 return target;
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006415
6416 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006417 * If the previous CPU is cache affine and idle, don't be stupid:
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006418 */
Viresh Kumar3c29e652019-06-26 10:36:30 +05306419 if (prev != target && cpus_share_cache(prev, target) &&
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006420 (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
6421 asym_fits_capacity(task_util, prev))
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006422 return prev;
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006423
Mel Gorman52262ee2020-01-28 15:40:06 +00006424 /*
6425 * Allow a per-cpu kthread to stack with the wakee if the
6426 * kworker thread and the tasks previous CPUs are the same.
6427 * The assumption is that the wakee queued work for the
6428 * per-cpu kthread that is now complete and the wakeup is
6429 * essentially a sync wakeup. An obvious example of this
6430 * pattern is IO completions.
6431 */
6432 if (is_per_cpu_kthread(current) &&
Vincent Donnefort8b4e74c2021-12-01 14:34:50 +00006433 in_task() &&
Mel Gorman52262ee2020-01-28 15:40:06 +00006434 prev == smp_processor_id() &&
Vincent Donnefort014ba44e82021-11-29 17:31:15 +00006435 this_rq()->nr_running <= 1 &&
6436 asym_fits_capacity(task_util, prev)) {
Mel Gorman52262ee2020-01-28 15:40:06 +00006437 return prev;
6438 }
6439
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006440 /* Check a recently used CPU as a potential idle candidate: */
Mel Gorman32e839d2018-01-30 10:45:55 +00006441 recent_used_cpu = p->recent_used_cpu;
Mel Gorman89aafd62021-08-04 12:58:56 +01006442 p->recent_used_cpu = prev;
Mel Gorman32e839d2018-01-30 10:45:55 +00006443 if (recent_used_cpu != prev &&
6444 recent_used_cpu != target &&
6445 cpus_share_cache(recent_used_cpu, target) &&
Viresh Kumar3c29e652019-06-26 10:36:30 +05306446 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006447 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
6448 asym_fits_capacity(task_util, recent_used_cpu)) {
Mel Gorman32e839d2018-01-30 10:45:55 +00006449 return recent_used_cpu;
6450 }
6451
Vincent Guittotb4c9c9f2020-10-29 17:18:24 +01006452 /*
6453 * For asymmetric CPU capacity systems, our domain of interest is
6454 * sd_asym_cpucapacity rather than sd_llc.
6455 */
6456 if (static_branch_unlikely(&sched_asym_cpucapacity)) {
6457 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
6458 /*
6459 * On an asymmetric CPU capacity system where an exclusive
6460 * cpuset defines a symmetric island (i.e. one unique
6461 * capacity_orig value through the cpuset), the key will be set
6462 * but the CPUs within that cpuset will not have a domain with
6463 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
6464 * capacity path.
6465 */
6466 if (sd) {
6467 i = select_idle_capacity(p, sd, target);
6468 return ((unsigned)i < nr_cpumask_bits) ? i : target;
6469 }
6470 }
6471
Peter Zijlstra518cd622011-12-07 15:07:31 +01006472 sd = rcu_dereference(per_cpu(sd_llc, target));
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006473 if (!sd)
6474 return target;
Morten Rasmussen772bd008c2016-06-22 18:03:13 +01006475
Rik van Rielc722f352021-03-26 15:19:32 -04006476 if (sched_smt_active()) {
6477 has_idle_core = test_idle_cores(target, false);
6478
6479 if (!has_idle_core && cpus_share_cache(prev, target)) {
6480 i = select_idle_smt(p, sd, prev);
6481 if ((unsigned int)i < nr_cpumask_bits)
6482 return i;
6483 }
6484 }
6485
6486 i = select_idle_cpu(p, sd, has_idle_core, target);
Peter Zijlstra10e2f1a2016-05-09 10:38:05 +02006487 if ((unsigned)i < nr_cpumask_bits)
6488 return i;
Mike Galbraith970e1782012-06-12 05:18:32 +02006489
Peter Zijlstraa50bde52009-11-12 15:55:28 +01006490 return target;
6491}
Dietmar Eggemann231678b2015-08-14 17:23:13 +01006492
Morten Rasmussen32731632016-07-25 14:34:26 +01006493/*
Patrick Bellasic4699332018-11-05 14:53:58 +00006494 * cpu_util_without: compute cpu utilization without any contributions from *p
6495 * @cpu: the CPU which utilization is requested
6496 * @p: the task which utilization should be discounted
6497 *
6498 * The utilization of a CPU is defined by the utilization of tasks currently
6499 * enqueued on that CPU as well as tasks which are currently sleeping after an
6500 * execution on that CPU.
6501 *
6502 * This method returns the utilization of the specified CPU by discounting the
6503 * utilization of the specified task, whenever the task is currently
6504 * contributing to the CPU utilization.
Morten Rasmussen104cb162016-10-14 14:41:07 +01006505 */
Patrick Bellasic4699332018-11-05 14:53:58 +00006506static unsigned long cpu_util_without(int cpu, struct task_struct *p)
Morten Rasmussen104cb162016-10-14 14:41:07 +01006507{
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006508 struct cfs_rq *cfs_rq;
6509 unsigned int util;
Morten Rasmussen104cb162016-10-14 14:41:07 +01006510
6511 /* Task has no contribution or is new */
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006512 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
Dietmar Eggemann82762d22021-11-18 17:42:40 +01006513 return cpu_util_cfs(cpu);
Morten Rasmussen104cb162016-10-14 14:41:07 +01006514
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006515 cfs_rq = &cpu_rq(cpu)->cfs;
6516 util = READ_ONCE(cfs_rq->avg.util_avg);
Morten Rasmussen104cb162016-10-14 14:41:07 +01006517
Patrick Bellasic4699332018-11-05 14:53:58 +00006518 /* Discount task's util from CPU's util */
Patrick Bellasib5c0ce72018-11-05 14:54:00 +00006519 lsub_positive(&util, task_util(p));
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006520
6521 /*
6522 * Covered cases:
6523 *
6524 * a) if *p is the only task sleeping on this CPU, then:
6525 * cpu_util (== task_util) > util_est (== 0)
6526 * and thus we return:
Patrick Bellasic4699332018-11-05 14:53:58 +00006527 * cpu_util_without = (cpu_util - task_util) = 0
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006528 *
6529 * b) if other tasks are SLEEPING on this CPU, which is now exiting
6530 * IDLE, then:
6531 * cpu_util >= task_util
6532 * cpu_util > util_est (== 0)
6533 * and thus we discount *p's blocked utilization to return:
Patrick Bellasic4699332018-11-05 14:53:58 +00006534 * cpu_util_without = (cpu_util - task_util) >= 0
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006535 *
6536 * c) if other tasks are RUNNABLE on that CPU and
6537 * util_est > cpu_util
6538 * then we use util_est since it returns a more restrictive
6539 * estimation of the spare capacity on that CPU, by just
6540 * considering the expected utilization of tasks already
6541 * runnable on that CPU.
6542 *
6543 * Cases a) and b) are covered by the above code, while case c) is
6544 * covered by the following code when estimated utilization is
6545 * enabled.
6546 */
Patrick Bellasic4699332018-11-05 14:53:58 +00006547 if (sched_feat(UTIL_EST)) {
6548 unsigned int estimated =
6549 READ_ONCE(cfs_rq->avg.util_est.enqueued);
6550
6551 /*
6552 * Despite the following checks we still have a small window
6553 * for a possible race, when an execl's select_task_rq_fair()
6554 * races with LB's detach_task():
6555 *
6556 * detach_task()
6557 * p->on_rq = TASK_ON_RQ_MIGRATING;
6558 * ---------------------------------- A
6559 * deactivate_task() \
6560 * dequeue_task() + RaceTime
6561 * util_est_dequeue() /
6562 * ---------------------------------- B
6563 *
6564 * The additional check on "current == p" it's required to
6565 * properly fix the execl regression and it helps in further
6566 * reducing the chances for the above race.
6567 */
Patrick Bellasib5c0ce72018-11-05 14:54:00 +00006568 if (unlikely(task_on_rq_queued(p) || current == p))
6569 lsub_positive(&estimated, _task_util_est(p));
6570
Patrick Bellasic4699332018-11-05 14:53:58 +00006571 util = max(util, estimated);
6572 }
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006573
6574 /*
6575 * Utilization (estimated) can exceed the CPU capacity, thus let's
6576 * clamp to the maximum CPU capacity to ensure consistency with
Dietmar Eggemann82762d22021-11-18 17:42:40 +01006577 * cpu_util.
Patrick Bellasif9be3e52018-03-09 09:52:43 +00006578 */
6579 return min_t(unsigned long, util, capacity_orig_of(cpu));
Morten Rasmussen104cb162016-10-14 14:41:07 +01006580}
6581
6582/*
Quentin Perret390031e42018-12-03 09:56:26 +00006583 * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
6584 * to @dst_cpu.
6585 */
6586static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6587{
6588 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
6589 unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
6590
6591 /*
6592 * If @p migrates from @cpu to another, remove its contribution. Or,
6593 * if @p migrates from another CPU to @cpu, add its contribution. In
6594 * the other cases, @cpu is not impacted by the migration, so the
6595 * util_avg should already be correct.
6596 */
6597 if (task_cpu(p) == cpu && dst_cpu != cpu)
Vincent Donnefort736cc6b2021-02-25 08:36:12 +00006598 lsub_positive(&util, task_util(p));
Quentin Perret390031e42018-12-03 09:56:26 +00006599 else if (task_cpu(p) != cpu && dst_cpu == cpu)
6600 util += task_util(p);
6601
6602 if (sched_feat(UTIL_EST)) {
6603 util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
6604
6605 /*
6606 * During wake-up, the task isn't enqueued yet and doesn't
6607 * appear in the cfs_rq->avg.util_est.enqueued of any rq,
6608 * so just add it (if needed) to "simulate" what will be
Dietmar Eggemann82762d22021-11-18 17:42:40 +01006609 * cpu_util after the task has been enqueued.
Quentin Perret390031e42018-12-03 09:56:26 +00006610 */
6611 if (dst_cpu == cpu)
6612 util_est += _task_util_est(p);
6613
6614 util = max(util, util_est);
6615 }
6616
6617 return min(util, capacity_orig_of(cpu));
6618}
6619
6620/*
Quentin Perreteb926922019-09-12 11:44:04 +02006621 * compute_energy(): Estimates the energy that @pd would consume if @p was
Quentin Perret390031e42018-12-03 09:56:26 +00006622 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
Quentin Perreteb926922019-09-12 11:44:04 +02006623 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
Quentin Perret390031e42018-12-03 09:56:26 +00006624 * to compute what would be the energy if we decided to actually migrate that
6625 * task.
6626 */
6627static long
6628compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6629{
Quentin Perreteb926922019-09-12 11:44:04 +02006630 struct cpumask *pd_mask = perf_domain_span(pd);
6631 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6632 unsigned long max_util = 0, sum_util = 0;
Lukasz Luba489f1642021-06-14 20:11:28 +01006633 unsigned long _cpu_cap = cpu_cap;
Quentin Perret390031e42018-12-03 09:56:26 +00006634 int cpu;
6635
Lukasz Luba489f1642021-06-14 20:11:28 +01006636 _cpu_cap -= arch_scale_thermal_pressure(cpumask_first(pd_mask));
6637
Quentin Perreteb926922019-09-12 11:44:04 +02006638 /*
6639 * The capacity state of CPUs of the current rd can be driven by CPUs
6640 * of another rd if they belong to the same pd. So, account for the
6641 * utilization of these CPUs too by masking pd with cpu_online_mask
6642 * instead of the rd span.
6643 *
6644 * If an entire pd is outside of the current rd, it will not appear in
6645 * its pd list and will not be accounted by compute_energy().
6646 */
6647 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
Vincent Donnefort0372e1c2021-02-25 08:36:11 +00006648 unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
6649 unsigned long cpu_util, util_running = util_freq;
6650 struct task_struct *tsk = NULL;
6651
6652 /*
6653 * When @p is placed on @cpu:
6654 *
6655 * util_running = max(cpu_util, cpu_util_est) +
6656 * max(task_util, _task_util_est)
6657 *
6658 * while cpu_util_next is: max(cpu_util + task_util,
6659 * cpu_util_est + _task_util_est)
6660 */
6661 if (cpu == dst_cpu) {
6662 tsk = p;
6663 util_running =
6664 cpu_util_next(cpu, p, -1) + task_util_est(p);
6665 }
Patrick Bellasiaf24bde2019-06-21 09:42:12 +01006666
6667 /*
Quentin Perreteb926922019-09-12 11:44:04 +02006668 * Busy time computation: utilization clamping is not
6669 * required since the ratio (sum_util / cpu_capacity)
6670 * is already enough to scale the EM reported power
6671 * consumption at the (eventually clamped) cpu_capacity.
Patrick Bellasiaf24bde2019-06-21 09:42:12 +01006672 */
Lukasz Luba489f1642021-06-14 20:11:28 +01006673 cpu_util = effective_cpu_util(cpu, util_running, cpu_cap,
6674 ENERGY_UTIL, NULL);
6675
6676 sum_util += min(cpu_util, _cpu_cap);
Patrick Bellasiaf24bde2019-06-21 09:42:12 +01006677
Quentin Perret390031e42018-12-03 09:56:26 +00006678 /*
Quentin Perreteb926922019-09-12 11:44:04 +02006679 * Performance domain frequency: utilization clamping
6680 * must be considered since it affects the selection
6681 * of the performance domain frequency.
6682 * NOTE: in case RT tasks are running, by default the
6683 * FREQUENCY_UTIL's utilization can be max OPP.
Quentin Perret390031e42018-12-03 09:56:26 +00006684 */
Vincent Donnefort0372e1c2021-02-25 08:36:11 +00006685 cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
Quentin Perreteb926922019-09-12 11:44:04 +02006686 FREQUENCY_UTIL, tsk);
Lukasz Luba489f1642021-06-14 20:11:28 +01006687 max_util = max(max_util, min(cpu_util, _cpu_cap));
Quentin Perret390031e42018-12-03 09:56:26 +00006688 }
6689
Lukasz Luba8f1b9712021-06-14 20:12:38 +01006690 return em_cpu_energy(pd->em_pd, max_util, sum_util, _cpu_cap);
Quentin Perret390031e42018-12-03 09:56:26 +00006691}
6692
6693/*
Quentin Perret732cd752018-12-03 09:56:27 +00006694 * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
6695 * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
6696 * spare capacity in each performance domain and uses it as a potential
6697 * candidate to execute the task. Then, it uses the Energy Model to figure
6698 * out which of the CPU candidates is the most energy-efficient.
6699 *
6700 * The rationale for this heuristic is as follows. In a performance domain,
6701 * all the most energy efficient CPU candidates (according to the Energy
6702 * Model) are those for which we'll request a low frequency. When there are
6703 * several CPUs for which the frequency request will be the same, we don't
6704 * have enough data to break the tie between them, because the Energy Model
6705 * only includes active power costs. With this model, if we assume that
6706 * frequency requests follow utilization (e.g. using schedutil), the CPU with
6707 * the maximum spare capacity in a performance domain is guaranteed to be among
6708 * the best candidates of the performance domain.
6709 *
6710 * In practice, it could be preferable from an energy standpoint to pack
6711 * small tasks on a CPU in order to let other CPUs go in deeper idle states,
6712 * but that could also hurt our chances to go cluster idle, and we have no
6713 * ways to tell with the current Energy Model if this is actually a good
6714 * idea or not. So, find_energy_efficient_cpu() basically favors
6715 * cluster-packing, and spreading inside a cluster. That should at least be
6716 * a good thing for latency, and this is consistent with the idea that most
6717 * of the energy savings of EAS come from the asymmetry of the system, and
6718 * not so much from breaking the tie between identical CPUs. That's also the
6719 * reason why EAS is enabled in the topology code only for systems where
6720 * SD_ASYM_CPUCAPACITY is set.
6721 *
6722 * NOTE: Forkees are not accepted in the energy-aware wake-up path because
6723 * they don't have any useful utilization data yet and it's not possible to
6724 * forecast their impact on energy consumption. Consequently, they will be
6725 * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
6726 * to be energy-inefficient in some use-cases. The alternative would be to
6727 * bias new tasks towards specific types of CPUs first, or to try to infer
6728 * their util_avg from the parent task, but those heuristics could hurt
6729 * other use-cases too. So, until someone finds a better way to solve this,
6730 * let's keep things simple by re-using the existing slow path.
6731 */
Quentin Perret732cd752018-12-03 09:56:27 +00006732static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6733{
Quentin Perreteb926922019-09-12 11:44:04 +02006734 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
Quentin Perret732cd752018-12-03 09:56:27 +00006735 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
Pierre Gondois619e0902021-05-04 10:07:43 +01006736 int cpu, best_energy_cpu = prev_cpu, target = -1;
Quentin Perreteb926922019-09-12 11:44:04 +02006737 unsigned long cpu_cap, util, base_energy = 0;
Quentin Perret732cd752018-12-03 09:56:27 +00006738 struct sched_domain *sd;
Quentin Perreteb926922019-09-12 11:44:04 +02006739 struct perf_domain *pd;
Quentin Perret732cd752018-12-03 09:56:27 +00006740
6741 rcu_read_lock();
6742 pd = rcu_dereference(rd->pd);
6743 if (!pd || READ_ONCE(rd->overutilized))
Pierre Gondois619e0902021-05-04 10:07:43 +01006744 goto unlock;
Quentin Perret732cd752018-12-03 09:56:27 +00006745
6746 /*
6747 * Energy-aware wake-up happens on the lowest sched_domain starting
6748 * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
6749 */
6750 sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
6751 while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
6752 sd = sd->parent;
6753 if (!sd)
Pierre Gondois619e0902021-05-04 10:07:43 +01006754 goto unlock;
6755
6756 target = prev_cpu;
Quentin Perret732cd752018-12-03 09:56:27 +00006757
6758 sync_entity_load_avg(&p->se);
6759 if (!task_util_est(p))
6760 goto unlock;
6761
6762 for (; pd; pd = pd->next) {
Quentin Perreteb926922019-09-12 11:44:04 +02006763 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
Pierre Gondois8d4c97c2021-05-04 10:07:42 +01006764 bool compute_prev_delta = false;
Quentin Perreteb926922019-09-12 11:44:04 +02006765 unsigned long base_energy_pd;
Quentin Perret732cd752018-12-03 09:56:27 +00006766 int max_spare_cap_cpu = -1;
6767
6768 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02006769 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
Quentin Perret732cd752018-12-03 09:56:27 +00006770 continue;
6771
Quentin Perret732cd752018-12-03 09:56:27 +00006772 util = cpu_util_next(cpu, p, cpu);
6773 cpu_cap = capacity_of(cpu);
Lukasz Lubada0777d2020-08-10 09:30:04 +01006774 spare_cap = cpu_cap;
6775 lsub_positive(&spare_cap, util);
Valentin Schneider1d425092019-12-11 11:38:51 +00006776
6777 /*
6778 * Skip CPUs that cannot satisfy the capacity request.
6779 * IOW, placing the task there would make the CPU
6780 * overutilized. Take uclamp into account to see how
6781 * much capacity we can get out of the CPU; this is
Viresh Kumara5418be2020-12-08 09:46:56 +05306782 * aligned with sched_cpu_util().
Valentin Schneider1d425092019-12-11 11:38:51 +00006783 */
6784 util = uclamp_rq_util_with(cpu_rq(cpu), util, p);
Viresh Kumar60e17f52019-06-04 12:31:52 +05306785 if (!fits_capacity(util, cpu_cap))
Quentin Perret732cd752018-12-03 09:56:27 +00006786 continue;
6787
Quentin Perret732cd752018-12-03 09:56:27 +00006788 if (cpu == prev_cpu) {
Pierre Gondois8d4c97c2021-05-04 10:07:42 +01006789 /* Always use prev_cpu as a candidate. */
6790 compute_prev_delta = true;
6791 } else if (spare_cap > max_spare_cap) {
6792 /*
6793 * Find the CPU with the maximum spare capacity
6794 * in the performance domain.
6795 */
Quentin Perret732cd752018-12-03 09:56:27 +00006796 max_spare_cap = spare_cap;
6797 max_spare_cap_cpu = cpu;
6798 }
6799 }
6800
Pierre Gondois8d4c97c2021-05-04 10:07:42 +01006801 if (max_spare_cap_cpu < 0 && !compute_prev_delta)
6802 continue;
6803
6804 /* Compute the 'base' energy of the pd, without @p */
6805 base_energy_pd = compute_energy(p, -1, pd);
6806 base_energy += base_energy_pd;
6807
6808 /* Evaluate the energy impact of using prev_cpu. */
6809 if (compute_prev_delta) {
6810 prev_delta = compute_energy(p, prev_cpu, pd);
Pierre Gondois619e0902021-05-04 10:07:43 +01006811 if (prev_delta < base_energy_pd)
6812 goto unlock;
Pierre Gondois8d4c97c2021-05-04 10:07:42 +01006813 prev_delta -= base_energy_pd;
6814 best_delta = min(best_delta, prev_delta);
6815 }
6816
6817 /* Evaluate the energy impact of using max_spare_cap_cpu. */
6818 if (max_spare_cap_cpu >= 0) {
Quentin Perreteb926922019-09-12 11:44:04 +02006819 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
Pierre Gondois619e0902021-05-04 10:07:43 +01006820 if (cur_delta < base_energy_pd)
6821 goto unlock;
Quentin Perreteb926922019-09-12 11:44:04 +02006822 cur_delta -= base_energy_pd;
6823 if (cur_delta < best_delta) {
6824 best_delta = cur_delta;
Quentin Perret732cd752018-12-03 09:56:27 +00006825 best_energy_cpu = max_spare_cap_cpu;
6826 }
6827 }
6828 }
Quentin Perret732cd752018-12-03 09:56:27 +00006829 rcu_read_unlock();
6830
6831 /*
6832 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
6833 * least 6% of the energy used by prev_cpu.
6834 */
Pierre Gondois619e0902021-05-04 10:07:43 +01006835 if ((prev_delta == ULONG_MAX) ||
6836 (prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6837 target = best_energy_cpu;
Quentin Perret732cd752018-12-03 09:56:27 +00006838
Pierre Gondois619e0902021-05-04 10:07:43 +01006839 return target;
Quentin Perret732cd752018-12-03 09:56:27 +00006840
Pierre Gondois619e0902021-05-04 10:07:43 +01006841unlock:
Quentin Perret732cd752018-12-03 09:56:27 +00006842 rcu_read_unlock();
6843
Pierre Gondois619e0902021-05-04 10:07:43 +01006844 return target;
Quentin Perret732cd752018-12-03 09:56:27 +00006845}
6846
6847/*
Morten Rasmussende91b9c2014-02-18 14:14:24 +00006848 * select_task_rq_fair: Select target runqueue for the waking task in domains
Valentin Schneider3aef1552020-11-02 18:45:13 +00006849 * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
Morten Rasmussende91b9c2014-02-18 14:14:24 +00006850 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006851 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006852 * Balances load by selecting the idlest CPU in the idlest group, or under
6853 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006854 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006855 * Returns the target CPU number.
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006856 */
Peter Zijlstra0017d732010-03-24 18:34:10 +01006857static int
Valentin Schneider3aef1552020-11-02 18:45:13 +00006858select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006859{
Valentin Schneider3aef1552020-11-02 18:45:13 +00006860 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306861 struct sched_domain *tmp, *sd = NULL;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006862 int cpu = smp_processor_id();
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006863 int new_cpu = prev_cpu;
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006864 int want_affine = 0;
Valentin Schneider3aef1552020-11-02 18:45:13 +00006865 /* SD_flags and WF_flags share the first nibble */
6866 int sd_flag = wake_flags & 0xF;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006867
Peter Zijlstra9099a14702020-11-17 18:19:35 -05006868 /*
6869 * required for stable ->cpus_allowed
6870 */
6871 lockdep_assert_held(&p->pi_lock);
Valentin Schneiderdc824eb82020-11-02 18:45:14 +00006872 if (wake_flags & WF_TTWU) {
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006873 record_wakee(p);
Quentin Perret732cd752018-12-03 09:56:27 +00006874
Peter Zijlstraf8a696f2018-12-05 11:23:56 +01006875 if (sched_energy_enabled()) {
Quentin Perret732cd752018-12-03 09:56:27 +00006876 new_cpu = find_energy_efficient_cpu(p, prev_cpu);
6877 if (new_cpu >= 0)
6878 return new_cpu;
6879 new_cpu = prev_cpu;
6880 }
6881
Morten Rasmussen00061962020-02-06 19:19:57 +00006882 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr);
Peter Zijlstrac58d25f2016-05-12 09:19:59 +02006883 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01006884
Peter Zijlstradce840a2011-04-07 14:09:50 +02006885 rcu_read_lock();
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006886 for_each_domain(cpu, tmp) {
6887 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006888 * If both 'cpu' and 'prev_cpu' are part of this domain,
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006889 * cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstrafe3bcfe2009-11-12 15:55:29 +01006890 */
Suresh Siddha99bd5e22010-03-31 16:47:45 -07006891 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6892 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306893 if (cpu != prev_cpu)
6894 new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
6895
6896 sd = NULL; /* Prefer wake_affine over balance flags */
Alex Shif03542a2012-07-26 08:55:34 +08006897 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006898 }
6899
Barry Song29174062021-10-16 19:11:09 +08006900 /*
6901 * Usually only true for WF_EXEC and WF_FORK, as sched_domains
6902 * usually do not have SD_BALANCE_WAKE set. That means wakeup
6903 * will usually go to the fast path.
6904 */
Alex Shif03542a2012-07-26 08:55:34 +08006905 if (tmp->flags & sd_flag)
Peter Zijlstra29cd8ba2009-09-17 09:01:14 +02006906 sd = tmp;
Mike Galbraith63b0e9e2015-07-14 17:39:50 +02006907 else if (!want_affine)
6908 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006909 }
Peter Zijlstraaaee1202009-09-10 13:36:25 +02006910
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306911 if (unlikely(sd)) {
6912 /* Slow path */
Brendan Jackman18bd1b4b2017-10-05 12:45:12 +01006913 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
Valentin Schneiderdc824eb82020-11-02 18:45:14 +00006914 } else if (wake_flags & WF_TTWU) { /* XXX always ? */
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306915 /* Fast path */
Viresh Kumarf1d88b42018-04-26 16:00:50 +05306916 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
Gregory Haskinse7693a32008-01-25 21:08:09 +01006917 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02006918 rcu_read_unlock();
Gregory Haskinse7693a32008-01-25 21:08:09 +01006919
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006920 return new_cpu;
Gregory Haskinse7693a32008-01-25 21:08:09 +01006921}
Paul Turner0a74bef2012-10-04 13:18:30 +02006922
Peter Zijlstra144d8482017-05-11 17:57:24 +02006923static void detach_entity_cfs_rq(struct sched_entity *se);
6924
Paul Turner0a74bef2012-10-04 13:18:30 +02006925/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006926 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
Paul Turner0a74bef2012-10-04 13:18:30 +02006927 * cfs_rq_of(p) references at time of call are still valid and identify the
Ingo Molnar97fb7a02018-03-03 14:01:12 +01006928 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
Paul Turner0a74bef2012-10-04 13:18:30 +02006929 */
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05306930static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
Paul Turner0a74bef2012-10-04 13:18:30 +02006931{
Paul Turneraff3e492012-10-04 13:18:30 +02006932 /*
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02006933 * As blocked tasks retain absolute vruntime the migration needs to
6934 * deal with this by subtracting the old and adding the new
6935 * min_vruntime -- the latter is done by enqueue_entity() when placing
6936 * the task on the new runqueue.
6937 */
Peter Zijlstra2f064a52021-06-11 10:28:17 +02006938 if (READ_ONCE(p->__state) == TASK_WAKING) {
Peter Zijlstra59efa0b2016-05-10 18:24:37 +02006939 struct sched_entity *se = &p->se;
6940 struct cfs_rq *cfs_rq = cfs_rq_of(se);
6941 u64 min_vruntime;
6942
6943#ifndef CONFIG_64BIT
6944 u64 min_vruntime_copy;
6945
6946 do {
6947 min_vruntime_copy = cfs_rq->min_vruntime_copy;
6948 smp_rmb();
6949 min_vruntime = cfs_rq->min_vruntime;
6950 } while (min_vruntime != min_vruntime_copy);
6951#else
6952 min_vruntime = cfs_rq->min_vruntime;
6953#endif
6954
6955 se->vruntime -= min_vruntime;
6956 }
6957
Peter Zijlstra144d8482017-05-11 17:57:24 +02006958 if (p->on_rq == TASK_ON_RQ_MIGRATING) {
6959 /*
6960 * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
6961 * rq->lock and can modify state directly.
6962 */
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05006963 lockdep_assert_rq_held(task_rq(p));
Peter Zijlstra144d8482017-05-11 17:57:24 +02006964 detach_entity_cfs_rq(&p->se);
6965
6966 } else {
6967 /*
6968 * We are supposed to update the task to "current" time, then
6969 * its up to date and ready to go to new CPU/cfs_rq. But we
6970 * have difficulty in getting what current time is, so simply
6971 * throw away the out-of-date time. This will result in the
6972 * wakee task is less decayed, but giving the wakee more load
6973 * sounds not bad.
6974 */
6975 remove_entity_load_avg(&p->se);
6976 }
Yuyang Du9d89c252015-07-15 08:04:37 +08006977
6978 /* Tell new CPU we are migrated */
6979 p->se.avg.last_update_time = 0;
Ben Segall3944a922014-05-15 15:59:20 -07006980
6981 /* We have migrated, no longer consider this task hot */
Yuyang Du9d89c252015-07-15 08:04:37 +08006982 p->se.exec_start = 0;
Srikar Dronamraju3f9672b2018-09-21 23:18:58 +05306983
6984 update_scan_period(p, new_cpu);
Paul Turner0a74bef2012-10-04 13:18:30 +02006985}
Yuyang Du12695572015-07-15 08:04:40 +08006986
6987static void task_dead_fair(struct task_struct *p)
6988{
6989 remove_entity_load_avg(&p->se);
6990}
Peter Zijlstra6e2df052019-11-08 11:11:52 +01006991
6992static int
6993balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6994{
6995 if (rq->nr_running)
6996 return 1;
6997
6998 return newidle_balance(rq, rf) != 0;
6999}
Gregory Haskinse7693a32008-01-25 21:08:09 +01007000#endif /* CONFIG_SMP */
7001
Cheng Jiana555e9d2017-12-07 21:30:43 +08007002static unsigned long wakeup_gran(struct sched_entity *se)
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02007003{
7004 unsigned long gran = sysctl_sched_wakeup_granularity;
7005
7006 /*
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01007007 * Since its curr running now, convert the gran from real-time
7008 * to virtual-time in his units.
Mike Galbraith13814d42010-03-11 17:17:04 +01007009 *
7010 * By using 'se' instead of 'curr' we penalize light tasks, so
7011 * they get preempted easier. That is, if 'se' < 'curr' then
7012 * the resulting gran will be larger, therefore penalizing the
7013 * lighter, if otoh 'se' > 'curr' then the resulting gran will
7014 * be smaller, again penalizing the lighter task.
7015 *
7016 * This is especially important for buddies when the leftmost
7017 * task is higher priority than the buddy.
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02007018 */
Shaohua Lif4ad9bd2011-04-08 12:53:09 +08007019 return calc_delta_fair(gran, se);
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02007020}
7021
7022/*
Peter Zijlstra464b7522008-10-24 11:06:15 +02007023 * Should 'se' preempt 'curr'.
7024 *
7025 * |s1
7026 * |s2
7027 * |s3
7028 * g
7029 * |<--->|c
7030 *
7031 * w(c, s1) = -1
7032 * w(c, s2) = 0
7033 * w(c, s3) = 1
7034 *
7035 */
7036static int
7037wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
7038{
7039 s64 gran, vdiff = curr->vruntime - se->vruntime;
7040
7041 if (vdiff <= 0)
7042 return -1;
7043
Cheng Jiana555e9d2017-12-07 21:30:43 +08007044 gran = wakeup_gran(se);
Peter Zijlstra464b7522008-10-24 11:06:15 +02007045 if (vdiff > gran)
7046 return 1;
7047
7048 return 0;
7049}
7050
Peter Zijlstra02479092008-11-04 21:25:10 +01007051static void set_last_buddy(struct sched_entity *se)
7052{
Daniel Axtensc5ae3662017-05-11 06:11:39 +10007053 for_each_sched_entity(se) {
7054 if (SCHED_WARN_ON(!se->on_rq))
7055 return;
Josh Don30400032021-07-29 19:00:18 -07007056 if (se_is_idle(se))
7057 return;
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07007058 cfs_rq_of(se)->last = se;
Daniel Axtensc5ae3662017-05-11 06:11:39 +10007059 }
Peter Zijlstra02479092008-11-04 21:25:10 +01007060}
7061
7062static void set_next_buddy(struct sched_entity *se)
7063{
Daniel Axtensc5ae3662017-05-11 06:11:39 +10007064 for_each_sched_entity(se) {
7065 if (SCHED_WARN_ON(!se->on_rq))
7066 return;
Josh Don30400032021-07-29 19:00:18 -07007067 if (se_is_idle(se))
7068 return;
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07007069 cfs_rq_of(se)->next = se;
Daniel Axtensc5ae3662017-05-11 06:11:39 +10007070 }
Peter Zijlstra02479092008-11-04 21:25:10 +01007071}
7072
Rik van Rielac53db52011-02-01 09:51:03 -05007073static void set_skip_buddy(struct sched_entity *se)
7074{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07007075 for_each_sched_entity(se)
7076 cfs_rq_of(se)->skip = se;
Rik van Rielac53db52011-02-01 09:51:03 -05007077}
7078
Peter Zijlstra464b7522008-10-24 11:06:15 +02007079/*
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007080 * Preempt the current task with a newly woken task if needed:
7081 */
Peter Zijlstra5a9b86f2009-09-16 13:47:58 +02007082static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007083{
7084 struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri8651a862007-10-15 17:00:12 +02007085 struct sched_entity *se = &curr->se, *pse = &p->se;
Mike Galbraith03e89e42008-12-16 08:45:30 +01007086 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02007087 int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007088 int next_buddy_marked = 0;
Josh Don30400032021-07-29 19:00:18 -07007089 int cse_is_idle, pse_is_idle;
Mike Galbraith03e89e42008-12-16 08:45:30 +01007090
Ingo Molnar4ae7d5c2008-03-19 01:42:00 +01007091 if (unlikely(se == pse))
7092 return;
7093
Paul Turner5238cdd2011-07-21 09:43:37 -07007094 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007095 * This is possible from callers such as attach_tasks(), in which we
Ingo Molnar3b037062021-03-18 13:38:50 +01007096 * unconditionally check_preempt_curr() after an enqueue (which may have
Paul Turner5238cdd2011-07-21 09:43:37 -07007097 * lead to a throttle). This both saves work and prevents false
7098 * next-buddy nomination below.
7099 */
7100 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
7101 return;
7102
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007103 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith3cb63d52009-09-11 12:01:17 +02007104 set_next_buddy(pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007105 next_buddy_marked = 1;
7106 }
Peter Zijlstra57fdc262008-09-23 15:33:45 +02007107
Bharata B Raoaec0a512008-08-28 14:42:49 +05307108 /*
7109 * We can come here with TIF_NEED_RESCHED already set from new task
7110 * wake up path.
Paul Turner5238cdd2011-07-21 09:43:37 -07007111 *
7112 * Note: this also catches the edge-case of curr being in a throttled
7113 * group (e.g. via set_curr_task), since update_curr() (in the
7114 * enqueue of curr) will have resulted in resched being set. This
7115 * prevents us from potentially nominating it as a false LAST_BUDDY
7116 * below.
Bharata B Raoaec0a512008-08-28 14:42:49 +05307117 */
7118 if (test_tsk_need_resched(curr))
7119 return;
7120
Darren Harta2f5c9a2011-02-22 13:04:33 -08007121 /* Idle tasks are by definition preempted by non-idle tasks. */
Viresh Kumar1da18432018-11-05 16:51:55 +05307122 if (unlikely(task_has_idle_policy(curr)) &&
7123 likely(!task_has_idle_policy(p)))
Darren Harta2f5c9a2011-02-22 13:04:33 -08007124 goto preempt;
7125
Ingo Molnar91c234b2007-10-15 17:00:18 +02007126 /*
Darren Harta2f5c9a2011-02-22 13:04:33 -08007127 * Batch and idle tasks do not preempt non-idle tasks (their preemption
7128 * is driven by the tick):
Ingo Molnar91c234b2007-10-15 17:00:18 +02007129 */
Ingo Molnar8ed92e52012-10-14 14:28:50 +02007130 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar91c234b2007-10-15 17:00:18 +02007131 return;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007132
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007133 find_matching_se(&se, &pse);
7134 BUG_ON(!pse);
Josh Don30400032021-07-29 19:00:18 -07007135
7136 cse_is_idle = se_is_idle(se);
7137 pse_is_idle = se_is_idle(pse);
7138
7139 /*
7140 * Preempt an idle group in favor of a non-idle group (and don't preempt
7141 * in the inverse case).
7142 */
7143 if (cse_is_idle && !pse_is_idle)
7144 goto preempt;
7145 if (cse_is_idle != pse_is_idle)
7146 return;
7147
7148 update_curr(cfs_rq_of(se));
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007149 if (wakeup_preempt_entity(se, pse) == 1) {
7150 /*
7151 * Bias pick_next to pick the sched entity that is
7152 * triggering this preemption.
7153 */
7154 if (!next_buddy_marked)
7155 set_next_buddy(pse);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007156 goto preempt;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07007157 }
Jupyung Leea65ac742009-11-17 18:51:40 +09007158
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007159 return;
7160
7161preempt:
Kirill Tkhai88751252014-06-29 00:03:57 +04007162 resched_curr(rq);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01007163 /*
7164 * Only set the backward buddy when the current task is still
7165 * on the rq. This can happen when a wakeup gets interleaved
7166 * with schedule on the ->pre_schedule() or idle_balance()
7167 * point, either of which can * drop the rq lock.
7168 *
7169 * Also, during early boot the idle thread is in the fair class,
7170 * for obvious reasons its a bad idea to schedule back to it.
7171 */
7172 if (unlikely(!se->on_rq || curr == rq->idle))
7173 return;
7174
7175 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7176 set_last_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007177}
7178
Peter Zijlstra21f56ffe2020-11-17 18:19:32 -05007179#ifdef CONFIG_SMP
7180static struct task_struct *pick_task_fair(struct rq *rq)
7181{
7182 struct sched_entity *se;
7183 struct cfs_rq *cfs_rq;
7184
7185again:
7186 cfs_rq = &rq->cfs;
7187 if (!cfs_rq->nr_running)
7188 return NULL;
7189
7190 do {
7191 struct sched_entity *curr = cfs_rq->curr;
7192
7193 /* When we pick for a remote RQ, we'll not have done put_prev_entity() */
7194 if (curr) {
7195 if (curr->on_rq)
7196 update_curr(cfs_rq);
7197 else
7198 curr = NULL;
7199
7200 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
7201 goto again;
7202 }
7203
7204 se = pick_next_entity(cfs_rq, curr);
7205 cfs_rq = group_cfs_rq(se);
7206 } while (cfs_rq);
7207
7208 return task_of(se);
7209}
7210#endif
7211
Peter Zijlstra5d7d6052019-11-08 14:15:57 +01007212struct task_struct *
Matt Flemingd8ac8972016-09-21 14:38:10 +01007213pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007214{
7215 struct cfs_rq *cfs_rq = &rq->cfs;
7216 struct sched_entity *se;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007217 struct task_struct *p;
Peter Zijlstra37e117c2014-02-14 12:25:08 +01007218 int new_tasks;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007219
Peter Zijlstra6e831252014-02-11 16:11:48 +01007220again:
Peter Zijlstra6e2df052019-11-08 11:11:52 +01007221 if (!sched_fair_runnable(rq))
Peter Zijlstra38033c32014-01-23 20:32:21 +01007222 goto idle;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007223
Viresh Kumar9674f5c2017-05-24 10:59:55 +05307224#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra67692432019-05-29 20:36:44 +00007225 if (!prev || prev->sched_class != &fair_sched_class)
Peter Zijlstra678d5712012-02-11 06:05:00 +01007226 goto simple;
7227
7228 /*
7229 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
7230 * likely that a next task is from the same cgroup as the current.
7231 *
7232 * Therefore attempt to avoid putting and setting the entire cgroup
7233 * hierarchy, only change the part that actually changes.
7234 */
7235
7236 do {
7237 struct sched_entity *curr = cfs_rq->curr;
7238
7239 /*
7240 * Since we got here without doing put_prev_entity() we also
7241 * have to consider cfs_rq->curr. If it is still a runnable
7242 * entity, update_curr() will update its vruntime, otherwise
7243 * forget we've ever seen it.
7244 */
Ben Segall54d27362015-04-06 15:28:10 -07007245 if (curr) {
7246 if (curr->on_rq)
7247 update_curr(cfs_rq);
7248 else
7249 curr = NULL;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007250
Ben Segall54d27362015-04-06 15:28:10 -07007251 /*
7252 * This call to check_cfs_rq_runtime() will do the
7253 * throttle and dequeue its entity in the parent(s).
Viresh Kumar9674f5c2017-05-24 10:59:55 +05307254 * Therefore the nr_running test will indeed
Ben Segall54d27362015-04-06 15:28:10 -07007255 * be correct.
7256 */
Viresh Kumar9674f5c2017-05-24 10:59:55 +05307257 if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
7258 cfs_rq = &rq->cfs;
7259
7260 if (!cfs_rq->nr_running)
7261 goto idle;
7262
Ben Segall54d27362015-04-06 15:28:10 -07007263 goto simple;
Viresh Kumar9674f5c2017-05-24 10:59:55 +05307264 }
Ben Segall54d27362015-04-06 15:28:10 -07007265 }
Peter Zijlstra678d5712012-02-11 06:05:00 +01007266
7267 se = pick_next_entity(cfs_rq, curr);
7268 cfs_rq = group_cfs_rq(se);
7269 } while (cfs_rq);
7270
7271 p = task_of(se);
7272
7273 /*
7274 * Since we haven't yet done put_prev_entity and if the selected task
7275 * is a different task than we started out with, try and touch the
7276 * least amount of cfs_rqs.
7277 */
7278 if (prev != p) {
7279 struct sched_entity *pse = &prev->se;
7280
7281 while (!(cfs_rq = is_same_group(se, pse))) {
7282 int se_depth = se->depth;
7283 int pse_depth = pse->depth;
7284
7285 if (se_depth <= pse_depth) {
7286 put_prev_entity(cfs_rq_of(pse), pse);
7287 pse = parent_entity(pse);
7288 }
7289 if (se_depth >= pse_depth) {
7290 set_next_entity(cfs_rq_of(se), se);
7291 se = parent_entity(se);
7292 }
7293 }
7294
7295 put_prev_entity(cfs_rq, pse);
7296 set_next_entity(cfs_rq, se);
7297 }
7298
Uladzislau Rezki93824902017-09-13 12:24:30 +02007299 goto done;
Peter Zijlstra678d5712012-02-11 06:05:00 +01007300simple:
Peter Zijlstra678d5712012-02-11 06:05:00 +01007301#endif
Peter Zijlstra67692432019-05-29 20:36:44 +00007302 if (prev)
7303 put_prev_task(rq, prev);
Peter Zijlstra606dba22012-02-11 06:05:00 +01007304
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007305 do {
Peter Zijlstra678d5712012-02-11 06:05:00 +01007306 se = pick_next_entity(cfs_rq, NULL);
Peter Zijlstraf4b67552008-11-04 21:25:07 +01007307 set_next_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007308 cfs_rq = group_cfs_rq(se);
7309 } while (cfs_rq);
7310
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01007311 p = task_of(se);
Peter Zijlstra678d5712012-02-11 06:05:00 +01007312
Norbert Manthey13a453c2018-02-27 08:47:40 +01007313done: __maybe_unused;
Uladzislau Rezki93824902017-09-13 12:24:30 +02007314#ifdef CONFIG_SMP
7315 /*
7316 * Move the next running task to the front of
7317 * the list, so our cfs_tasks list becomes MRU
7318 * one.
7319 */
7320 list_move(&p->se.group_node, &rq->cfs_tasks);
7321#endif
7322
Juri Lellie0ee4632021-02-08 08:35:54 +01007323 if (hrtick_enabled_fair(rq))
Mike Galbraithb39e66e2011-11-22 15:20:07 +01007324 hrtick_start_fair(rq, p);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01007325
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007326 update_misfit_status(p, rq);
7327
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01007328 return p;
Peter Zijlstra38033c32014-01-23 20:32:21 +01007329
7330idle:
Peter Zijlstra67692432019-05-29 20:36:44 +00007331 if (!rf)
7332 return NULL;
7333
Peter Zijlstra5ba553e2019-05-29 20:36:42 +00007334 new_tasks = newidle_balance(rq, rf);
Matt Fleming46f69fa2016-09-21 14:38:12 +01007335
Peter Zijlstra37e117c2014-02-14 12:25:08 +01007336 /*
Peter Zijlstra5ba553e2019-05-29 20:36:42 +00007337 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
Peter Zijlstra37e117c2014-02-14 12:25:08 +01007338 * possible for any higher priority task to appear. In that case we
7339 * must re-start the pick_next_entity() loop.
7340 */
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04007341 if (new_tasks < 0)
Peter Zijlstra37e117c2014-02-14 12:25:08 +01007342 return RETRY_TASK;
7343
Kirill Tkhaie4aa3582014-03-06 13:31:55 +04007344 if (new_tasks > 0)
Peter Zijlstra38033c32014-01-23 20:32:21 +01007345 goto again;
Peter Zijlstra38033c32014-01-23 20:32:21 +01007346
Vincent Guittot23127292019-01-23 16:26:53 +01007347 /*
7348 * rq is about to be idle, check if we need to update the
7349 * lost_idle_time of clock_pelt
7350 */
7351 update_idle_rq_clock_pelt(rq);
7352
Peter Zijlstra38033c32014-01-23 20:32:21 +01007353 return NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007354}
7355
Peter Zijlstra98c2f702019-11-08 14:15:58 +01007356static struct task_struct *__pick_next_task_fair(struct rq *rq)
7357{
7358 return pick_next_task_fair(rq, NULL, NULL);
7359}
7360
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007361/*
7362 * Account for a descheduled task:
7363 */
Peter Zijlstra6e2df052019-11-08 11:11:52 +01007364static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007365{
7366 struct sched_entity *se = &prev->se;
7367 struct cfs_rq *cfs_rq;
7368
7369 for_each_sched_entity(se) {
7370 cfs_rq = cfs_rq_of(se);
Ingo Molnarab6cde22007-08-09 11:16:48 +02007371 put_prev_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007372 }
7373}
7374
Rik van Rielac53db52011-02-01 09:51:03 -05007375/*
7376 * sched_yield() is very simple
7377 *
7378 * The magic of dealing with the ->skip buddy is in pick_next_entity.
7379 */
7380static void yield_task_fair(struct rq *rq)
7381{
7382 struct task_struct *curr = rq->curr;
7383 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7384 struct sched_entity *se = &curr->se;
7385
7386 /*
7387 * Are we the only task in the tree?
7388 */
7389 if (unlikely(rq->nr_running == 1))
7390 return;
7391
7392 clear_buddies(cfs_rq, se);
7393
7394 if (curr->policy != SCHED_BATCH) {
7395 update_rq_clock(rq);
7396 /*
7397 * Update run-time statistics of the 'current'.
7398 */
7399 update_curr(cfs_rq);
Mike Galbraith916671c2011-11-22 15:21:26 +01007400 /*
7401 * Tell update_rq_clock() that we've just updated,
7402 * so we don't do microscopic update in schedule()
7403 * and double the fastpath cost.
7404 */
Davidlohr Buesoadcc8da2018-04-04 09:15:39 -07007405 rq_clock_skip_update(rq);
Rik van Rielac53db52011-02-01 09:51:03 -05007406 }
7407
7408 set_skip_buddy(se);
7409}
7410
Dietmar Eggemann0900acf2020-06-03 10:03:02 +02007411static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
Mike Galbraithd95f4122011-02-01 09:50:51 -05007412{
7413 struct sched_entity *se = &p->se;
7414
Paul Turner5238cdd2011-07-21 09:43:37 -07007415 /* throttled hierarchies are not runnable */
7416 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraithd95f4122011-02-01 09:50:51 -05007417 return false;
7418
7419 /* Tell the scheduler that we'd really like pse to run next. */
7420 set_next_buddy(se);
7421
Mike Galbraithd95f4122011-02-01 09:50:51 -05007422 yield_task_fair(rq);
7423
7424 return true;
7425}
7426
Peter Williams681f3e62007-10-24 18:23:51 +02007427#ifdef CONFIG_SMP
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007428/**************************************************
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007429 * Fair scheduling class load-balancing methods.
7430 *
7431 * BASICS
7432 *
7433 * The purpose of load-balancing is to achieve the same basic fairness the
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007434 * per-CPU scheduler provides, namely provide a proportional amount of compute
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007435 * time to each task. This is expressed in the following equation:
7436 *
7437 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
7438 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007439 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007440 * W_i,0 is defined as:
7441 *
7442 * W_i,0 = \Sum_j w_i,j (2)
7443 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007444 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
Yuyang Du1c3de5e2016-03-30 07:07:51 +08007445 * is derived from the nice value as per sched_prio_to_weight[].
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007446 *
7447 * The weight average is an exponential decay average of the instantaneous
7448 * weight:
7449 *
7450 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
7451 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007452 * C_i is the compute capacity of CPU i, typically it is the
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007453 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
7454 * can also include other factors [XXX].
7455 *
7456 * To achieve this balance we define a measure of imbalance which follows
7457 * directly from (1):
7458 *
Nicolas Pitreced549f2014-05-26 18:19:38 -04007459 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007460 *
7461 * We them move tasks around to minimize the imbalance. In the continuous
7462 * function space it is obvious this converges, in the discrete case we get
7463 * a few fun cases generally called infeasible weight scenarios.
7464 *
7465 * [XXX expand on:
7466 * - infeasible weights;
7467 * - local vs global optima in the discrete case. ]
7468 *
7469 *
7470 * SCHED DOMAINS
7471 *
7472 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007473 * for all i,j solution, we create a tree of CPUs that follows the hardware
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007474 * topology where each level pairs two lower groups (or better). This results
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007475 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007476 * tree to only the first of the previous level and we decrease the frequency
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007477 * of load-balance at each level inv. proportional to the number of CPUs in
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007478 * the groups.
7479 *
7480 * This yields:
7481 *
7482 * log_2 n 1 n
7483 * \Sum { --- * --- * 2^i } = O(n) (5)
7484 * i = 0 2^i 2^i
7485 * `- size of each group
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007486 * | | `- number of CPUs doing load-balance
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007487 * | `- freq
7488 * `- sum over all levels
7489 *
7490 * Coupled with a limit on how many tasks we can migrate every balance pass,
7491 * this makes (5) the runtime complexity of the balancer.
7492 *
7493 * An important property here is that each CPU is still (indirectly) connected
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007494 * to every other CPU in at most O(log n) steps:
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007495 *
7496 * The adjacency matrix of the resulting graph is given by:
7497 *
Byungchul Park97a71422015-07-05 18:33:48 +09007498 * log_2 n
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007499 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
7500 * k = 0
7501 *
7502 * And you'll find that:
7503 *
7504 * A^(log_2 n)_i,j != 0 for all i,j (7)
7505 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007506 * Showing there's indeed a path between every CPU in at most O(log n) steps.
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007507 * The task movement gives a factor of O(m), giving a convergence complexity
7508 * of:
7509 *
7510 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
7511 *
7512 *
7513 * WORK CONSERVING
7514 *
7515 * In order to avoid CPUs going idle while there's still work to do, new idle
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007516 * balancing is more aggressive and has the newly idle CPU iterate up the domain
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007517 * tree itself instead of relying on other CPUs to bring it work.
7518 *
7519 * This adds some complexity to both (5) and (8) but it reduces the total idle
7520 * time.
7521 *
7522 * [XXX more?]
7523 *
7524 *
7525 * CGROUPS
7526 *
7527 * Cgroups make a horror show out of (2), instead of a simple sum we get:
7528 *
7529 * s_k,i
7530 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
7531 * S_k
7532 *
7533 * Where
7534 *
7535 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
7536 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007537 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02007538 *
7539 * The big problem is S_k, its a global sum needed to compute a local (W_i)
7540 * property.
7541 *
7542 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
7543 * rewrite all of this once again.]
Byungchul Park97a71422015-07-05 18:33:48 +09007544 */
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02007545
Hiroshi Shimamotoed387b72012-01-31 11:40:32 +09007546static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7547
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01007548enum fbq_type { regular, remote, all };
7549
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007550/*
Vincent Guittota9723382019-11-12 15:50:43 +01007551 * 'group_type' describes the group of CPUs at the moment of load balancing.
7552 *
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007553 * The enum is ordered by pulling priority, with the group with lowest priority
Vincent Guittota9723382019-11-12 15:50:43 +01007554 * first so the group_type can simply be compared when selecting the busiest
7555 * group. See update_sd_pick_busiest().
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007556 */
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007557enum group_type {
Vincent Guittota9723382019-11-12 15:50:43 +01007558 /* The group has spare capacity that can be used to run more tasks. */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007559 group_has_spare = 0,
Vincent Guittota9723382019-11-12 15:50:43 +01007560 /*
7561 * The group is fully used and the tasks don't compete for more CPU
7562 * cycles. Nevertheless, some tasks might wait before running.
7563 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007564 group_fully_busy,
Vincent Guittota9723382019-11-12 15:50:43 +01007565 /*
7566 * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity
7567 * and must be migrated to a more powerful CPU.
7568 */
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007569 group_misfit_task,
Vincent Guittota9723382019-11-12 15:50:43 +01007570 /*
7571 * SD_ASYM_PACKING only: One local CPU with higher capacity is available,
7572 * and the task should be migrated to it instead of running on the
7573 * current CPU.
7574 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007575 group_asym_packing,
Vincent Guittota9723382019-11-12 15:50:43 +01007576 /*
7577 * The tasks' affinity constraints previously prevented the scheduler
7578 * from balancing the load across the system.
7579 */
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007580 group_imbalanced,
Vincent Guittota9723382019-11-12 15:50:43 +01007581 /*
7582 * The CPU is overloaded and can't provide expected CPU cycles to all
7583 * tasks.
7584 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007585 group_overloaded
7586};
7587
7588enum migration_type {
7589 migrate_load = 0,
7590 migrate_util,
7591 migrate_task,
7592 migrate_misfit
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01007593};
7594
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007595#define LBF_ALL_PINNED 0x01
Peter Zijlstra367456c2012-02-20 21:49:09 +01007596#define LBF_NEED_BREAK 0x02
Peter Zijlstra62633222013-08-19 12:41:09 +02007597#define LBF_DST_PINNED 0x04
7598#define LBF_SOME_PINNED 0x08
Valentin Schneider23fb06d2021-04-07 23:06:27 +01007599#define LBF_ACTIVE_LB 0x10
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007600
7601struct lb_env {
7602 struct sched_domain *sd;
7603
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007604 struct rq *src_rq;
Prashanth Nageshappa85c1e7d2012-06-19 17:47:34 +05307605 int src_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007606
7607 int dst_cpu;
7608 struct rq *dst_rq;
7609
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307610 struct cpumask *dst_grpmask;
7611 int new_dst_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007612 enum cpu_idle_type idle;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007613 long imbalance;
Michael Wangb94031302012-07-12 16:10:13 +08007614 /* The set of CPUs under consideration for load-balancing */
7615 struct cpumask *cpus;
7616
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007617 unsigned int flags;
Peter Zijlstra367456c2012-02-20 21:49:09 +01007618
7619 unsigned int loop;
7620 unsigned int loop_break;
7621 unsigned int loop_max;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01007622
7623 enum fbq_type fbq_type;
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007624 enum migration_type migration_type;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007625 struct list_head tasks;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007626};
7627
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007628/*
Peter Zijlstra029632f2011-10-25 10:00:11 +02007629 * Is this task likely cache-hot:
7630 */
Hillf Danton5d5e2b12014-06-10 10:58:43 +02007631static int task_hot(struct task_struct *p, struct lb_env *env)
Peter Zijlstra029632f2011-10-25 10:00:11 +02007632{
7633 s64 delta;
7634
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05007635 lockdep_assert_rq_held(env->src_rq);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007636
Peter Zijlstra029632f2011-10-25 10:00:11 +02007637 if (p->sched_class != &fair_sched_class)
7638 return 0;
7639
Viresh Kumar1da18432018-11-05 16:51:55 +05307640 if (unlikely(task_has_idle_policy(p)))
Peter Zijlstra029632f2011-10-25 10:00:11 +02007641 return 0;
7642
Josh Donec732402020-08-04 12:34:13 -07007643 /* SMT siblings share cache */
7644 if (env->sd->flags & SD_SHARE_CPUCAPACITY)
7645 return 0;
7646
Peter Zijlstra029632f2011-10-25 10:00:11 +02007647 /*
7648 * Buddy candidates are cache hot:
7649 */
Hillf Danton5d5e2b12014-06-10 10:58:43 +02007650 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
Peter Zijlstra029632f2011-10-25 10:00:11 +02007651 (&p->se == cfs_rq_of(&p->se)->next ||
7652 &p->se == cfs_rq_of(&p->se)->last))
7653 return 1;
7654
7655 if (sysctl_sched_migration_cost == -1)
7656 return 1;
Aubrey Li97886d92021-03-24 17:40:13 -04007657
7658 /*
7659 * Don't migrate task if the task's cookie does not match
7660 * with the destination CPU's core cookie.
7661 */
7662 if (!sched_core_cookie_match(cpu_rq(env->dst_cpu), p))
7663 return 1;
7664
Peter Zijlstra029632f2011-10-25 10:00:11 +02007665 if (sysctl_sched_migration_cost == 0)
7666 return 0;
7667
Hillf Danton5d5e2b12014-06-10 10:58:43 +02007668 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
Peter Zijlstra029632f2011-10-25 10:00:11 +02007669
7670 return delta < (s64)sysctl_sched_migration_cost;
7671}
7672
Mel Gorman3a7053b2013-10-07 11:29:00 +01007673#ifdef CONFIG_NUMA_BALANCING
Rik van Rielc1ceac62015-05-14 22:59:36 -04007674/*
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307675 * Returns 1, if task migration degrades locality
7676 * Returns 0, if task migration improves locality i.e migration preferred.
7677 * Returns -1, if task migration is not affected by locality.
Rik van Rielc1ceac62015-05-14 22:59:36 -04007678 */
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307679static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
Mel Gorman3a7053b2013-10-07 11:29:00 +01007680{
Rik van Rielb1ad0652014-05-15 13:03:06 -04007681 struct numa_group *numa_group = rcu_dereference(p->numa_group);
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307682 unsigned long src_weight, dst_weight;
7683 int src_nid, dst_nid, dist;
Mel Gorman3a7053b2013-10-07 11:29:00 +01007684
Srikar Dronamraju2a595722015-08-11 21:54:21 +05307685 if (!static_branch_likely(&sched_numa_balancing))
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307686 return -1;
7687
Srikar Dronamrajuc3b9bc52015-08-11 16:30:12 +05307688 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307689 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007690
7691 src_nid = cpu_to_node(env->src_cpu);
7692 dst_nid = cpu_to_node(env->dst_cpu);
7693
Mel Gorman83e1d2c2013-10-07 11:29:27 +01007694 if (src_nid == dst_nid)
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307695 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007696
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307697 /* Migrating away from the preferred node is always bad. */
7698 if (src_nid == p->numa_preferred_nid) {
7699 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7700 return 1;
7701 else
7702 return -1;
7703 }
Mel Gorman83e1d2c2013-10-07 11:29:27 +01007704
Rik van Rielc1ceac62015-05-14 22:59:36 -04007705 /* Encourage migration to the preferred node. */
7706 if (dst_nid == p->numa_preferred_nid)
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307707 return 0;
Rik van Rielc1ceac62015-05-14 22:59:36 -04007708
Rik van Riel739294f2017-06-23 12:55:27 -04007709 /* Leaving a core idle is often worse than degrading locality. */
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307710 if (env->idle == CPU_IDLE)
Rik van Riel739294f2017-06-23 12:55:27 -04007711 return -1;
7712
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307713 dist = node_distance(src_nid, dst_nid);
Rik van Rielc1ceac62015-05-14 22:59:36 -04007714 if (numa_group) {
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307715 src_weight = group_weight(p, src_nid, dist);
7716 dst_weight = group_weight(p, dst_nid, dist);
Rik van Rielc1ceac62015-05-14 22:59:36 -04007717 } else {
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307718 src_weight = task_weight(p, src_nid, dist);
7719 dst_weight = task_weight(p, dst_nid, dist);
Rik van Rielc1ceac62015-05-14 22:59:36 -04007720 }
7721
Srikar Dronamrajuf35678b2018-06-20 22:32:56 +05307722 return dst_weight < src_weight;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007723}
7724
Mel Gorman3a7053b2013-10-07 11:29:00 +01007725#else
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307726static inline int migrate_degrades_locality(struct task_struct *p,
Mel Gorman3a7053b2013-10-07 11:29:00 +01007727 struct lb_env *env)
7728{
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307729 return -1;
Mel Gorman7a0f3082013-10-07 11:29:01 +01007730}
Mel Gorman3a7053b2013-10-07 11:29:00 +01007731#endif
7732
Peter Zijlstra029632f2011-10-25 10:00:11 +02007733/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007734 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7735 */
7736static
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007737int can_migrate_task(struct task_struct *p, struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007738{
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307739 int tsk_cache_hot;
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007740
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05007741 lockdep_assert_rq_held(env->src_rq);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007742
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007743 /*
7744 * We do not migrate tasks that are:
Joonsoo Kimd3198082013-04-23 17:27:40 +09007745 * 1) throttled_lb_pair, or
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02007746 * 2) cannot be migrated to this CPU due to cpus_ptr, or
Joonsoo Kimd3198082013-04-23 17:27:40 +09007747 * 3) running (obviously), or
7748 * 4) are cache-hot on their current CPU.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007749 */
Joonsoo Kimd3198082013-04-23 17:27:40 +09007750 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7751 return 0;
7752
Lingutla Chandrasekhar9bcb959d2021-04-07 23:06:26 +01007753 /* Disregard pcpu kthreads; they are where they need to be. */
Peter Zijlstra3a7956e2021-04-20 10:18:17 +02007754 if (kthread_is_per_cpu(p))
Lingutla Chandrasekhar9bcb959d2021-04-07 23:06:26 +01007755 return 0;
7756
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02007757 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007758 int cpu;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307759
Yafang Shaoceeadb82021-09-05 14:35:41 +00007760 schedstat_inc(p->stats.nr_failed_migrations_affine);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307761
Peter Zijlstra62633222013-08-19 12:41:09 +02007762 env->flags |= LBF_SOME_PINNED;
7763
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307764 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007765 * Remember if this task can be migrated to any other CPU in
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307766 * our sched_group. We may want to revisit it if we couldn't
7767 * meet load balance goals by pulling other tasks on src_cpu.
7768 *
Valentin Schneider23fb06d2021-04-07 23:06:27 +01007769 * Avoid computing new_dst_cpu
7770 * - for NEWLY_IDLE
7771 * - if we have already computed one in current iteration
7772 * - if it's an active balance
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307773 */
Valentin Schneider23fb06d2021-04-07 23:06:27 +01007774 if (env->idle == CPU_NEWLY_IDLE ||
7775 env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307776 return 0;
7777
Ingo Molnar97fb7a02018-03-03 14:01:12 +01007778 /* Prevent to re-select dst_cpu via env's CPUs: */
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007779 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02007780 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
Peter Zijlstra62633222013-08-19 12:41:09 +02007781 env->flags |= LBF_DST_PINNED;
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007782 env->new_dst_cpu = cpu;
7783 break;
7784 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307785 }
Joonsoo Kime02e60c2013-04-23 17:27:42 +09007786
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007787 return 0;
7788 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05307789
Ingo Molnar3b037062021-03-18 13:38:50 +01007790 /* Record that we found at least one task that could run on dst_cpu */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007791 env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007792
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01007793 if (task_running(env->src_rq, p)) {
Yafang Shaoceeadb82021-09-05 14:35:41 +00007794 schedstat_inc(p->stats.nr_failed_migrations_running);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007795 return 0;
7796 }
7797
7798 /*
7799 * Aggressive migration if:
Valentin Schneider23fb06d2021-04-07 23:06:27 +01007800 * 1) active balance
7801 * 2) destination numa is preferred
7802 * 3) task is cache cold, or
7803 * 4) too many balance attempts have failed.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007804 */
Valentin Schneider23fb06d2021-04-07 23:06:27 +01007805 if (env->flags & LBF_ACTIVE_LB)
7806 return 1;
7807
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307808 tsk_cache_hot = migrate_degrades_locality(p, env);
7809 if (tsk_cache_hot == -1)
7810 tsk_cache_hot = task_hot(p, env);
Mel Gorman3a7053b2013-10-07 11:29:00 +01007811
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307812 if (tsk_cache_hot <= 0 ||
Kirill Tkhai7a96c232014-09-22 22:36:12 +04007813 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Srikar Dronamraju2a1ed242015-06-16 17:25:59 +05307814 if (tsk_cache_hot == 1) {
Josh Poimboeufae928822016-06-17 12:43:24 -05007815 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
Yafang Shaoceeadb82021-09-05 14:35:41 +00007816 schedstat_inc(p->stats.nr_forced_migrations);
Mel Gorman3a7053b2013-10-07 11:29:00 +01007817 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007818 return 1;
7819 }
7820
Yafang Shaoceeadb82021-09-05 14:35:41 +00007821 schedstat_inc(p->stats.nr_failed_migrations_hot);
Zhang Hang4e2dcb72013-04-10 14:04:55 +08007822 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007823}
7824
Peter Zijlstra897c3952009-12-17 17:45:42 +01007825/*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007826 * detach_task() -- detach the task for the migration specified in env
Peter Zijlstra897c3952009-12-17 17:45:42 +01007827 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04007828static void detach_task(struct task_struct *p, struct lb_env *env)
7829{
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05007830 lockdep_assert_rq_held(env->src_rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007831
Peter Zijlstra5704ac02017-02-21 17:15:21 +01007832 deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
Kirill Tkhai163122b2014-08-20 13:48:29 +04007833 set_task_cpu(p, env->dst_cpu);
7834}
7835
7836/*
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007837 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
Peter Zijlstra897c3952009-12-17 17:45:42 +01007838 * part of active balancing operations within "domain".
Peter Zijlstra897c3952009-12-17 17:45:42 +01007839 *
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007840 * Returns a task if successful and NULL otherwise.
Peter Zijlstra897c3952009-12-17 17:45:42 +01007841 */
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007842static struct task_struct *detach_one_task(struct lb_env *env)
Peter Zijlstra897c3952009-12-17 17:45:42 +01007843{
Uladzislau Rezki93824902017-09-13 12:24:30 +02007844 struct task_struct *p;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007845
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05007846 lockdep_assert_rq_held(env->src_rq);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007847
Uladzislau Rezki93824902017-09-13 12:24:30 +02007848 list_for_each_entry_reverse(p,
7849 &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra367456c2012-02-20 21:49:09 +01007850 if (!can_migrate_task(p, env))
7851 continue;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007852
Kirill Tkhai163122b2014-08-20 13:48:29 +04007853 detach_task(p, env);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007854
Peter Zijlstra367456c2012-02-20 21:49:09 +01007855 /*
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007856 * Right now, this is only the second place where
Kirill Tkhai163122b2014-08-20 13:48:29 +04007857 * lb_gained[env->idle] is updated (other is detach_tasks)
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007858 * so we can safely collect stats here rather than
Kirill Tkhai163122b2014-08-20 13:48:29 +04007859 * inside detach_tasks().
Peter Zijlstra367456c2012-02-20 21:49:09 +01007860 */
Josh Poimboeufae928822016-06-17 12:43:24 -05007861 schedstat_inc(env->sd->lb_gained[env->idle]);
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007862 return p;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007863 }
Kirill Tkhaie5673f22014-08-20 13:48:01 +04007864 return NULL;
Peter Zijlstra897c3952009-12-17 17:45:42 +01007865}
7866
Peter Zijlstraeb953082012-04-17 13:38:40 +02007867static const unsigned int sched_nr_migrate_break = 32;
7868
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007869/*
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007870 * detach_tasks() -- tries to detach up to imbalance load/util/tasks from
Kirill Tkhai163122b2014-08-20 13:48:29 +04007871 * busiest_rq, as part of a balancing operation within domain "sd".
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007872 *
Kirill Tkhai163122b2014-08-20 13:48:29 +04007873 * Returns number of detached tasks if successful and 0 otherwise.
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007874 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04007875static int detach_tasks(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007876{
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007877 struct list_head *tasks = &env->src_rq->cfs_tasks;
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007878 unsigned long util, load;
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007879 struct task_struct *p;
Kirill Tkhai163122b2014-08-20 13:48:29 +04007880 int detached = 0;
7881
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05007882 lockdep_assert_rq_held(env->src_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007883
Aubrey Liacb4dec2021-02-24 16:15:49 +08007884 /*
7885 * Source run queue has been emptied by another CPU, clear
7886 * LBF_ALL_PINNED flag as we will not test any task.
7887 */
7888 if (env->src_rq->nr_running <= 1) {
7889 env->flags &= ~LBF_ALL_PINNED;
7890 return 0;
7891 }
7892
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007893 if (env->imbalance <= 0)
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007894 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007895
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007896 while (!list_empty(tasks)) {
Yuyang Du985d3a42015-07-06 06:11:51 +08007897 /*
7898 * We don't want to steal all, otherwise we may be treated likewise,
7899 * which could at worst lead to a livelock crash.
7900 */
7901 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7902 break;
7903
Uladzislau Rezki93824902017-09-13 12:24:30 +02007904 p = list_last_entry(tasks, struct task_struct, se.group_node);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007905
Peter Zijlstra367456c2012-02-20 21:49:09 +01007906 env->loop++;
7907 /* We've more or less seen every task there is, call it quits */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007908 if (env->loop > env->loop_max)
Peter Zijlstra367456c2012-02-20 21:49:09 +01007909 break;
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007910
7911 /* take a breather every nr_migrate tasks */
Peter Zijlstra367456c2012-02-20 21:49:09 +01007912 if (env->loop > env->loop_break) {
Peter Zijlstraeb953082012-04-17 13:38:40 +02007913 env->loop_break += sched_nr_migrate_break;
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01007914 env->flags |= LBF_NEED_BREAK;
Peter Zijlstraee00e662009-12-17 17:25:20 +01007915 break;
Peter Zijlstraa195f002011-09-22 15:30:18 +02007916 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007917
Joonsoo Kimd3198082013-04-23 17:27:40 +09007918 if (!can_migrate_task(p, env))
Peter Zijlstra367456c2012-02-20 21:49:09 +01007919 goto next;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007920
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007921 switch (env->migration_type) {
7922 case migrate_load:
Vincent Guittot01cfcde2020-07-10 17:24:26 +02007923 /*
7924 * Depending of the number of CPUs and tasks and the
7925 * cgroup hierarchy, task_h_load() can return a null
7926 * value. Make sure that env->imbalance decreases
7927 * otherwise detach_tasks() will stop only after
7928 * detaching up to loop_max tasks.
7929 */
7930 load = max_t(unsigned long, task_h_load(p), 1);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007931
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007932 if (sched_feat(LB_MIN) &&
7933 load < 16 && !env->sd->nr_balance_failed)
7934 goto next;
Peter Zijlstra367456c2012-02-20 21:49:09 +01007935
Vincent Guittot6cf82d52019-11-29 15:04:47 +01007936 /*
7937 * Make sure that we don't migrate too much load.
7938 * Nevertheless, let relax the constraint if
7939 * scheduler fails to find a good waiting task to
7940 * migrate.
7941 */
Valentin Schneider39a2a6e2021-02-25 17:56:56 +00007942 if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007943 goto next;
7944
7945 env->imbalance -= load;
7946 break;
7947
7948 case migrate_util:
7949 util = task_util_est(p);
7950
7951 if (util > env->imbalance)
7952 goto next;
7953
7954 env->imbalance -= util;
7955 break;
7956
7957 case migrate_task:
7958 env->imbalance--;
7959 break;
7960
7961 case migrate_misfit:
Vincent Guittotc63be7b2019-10-18 15:26:35 +02007962 /* This is not a misfit task */
7963 if (task_fits_capacity(p, capacity_of(env->src_cpu)))
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007964 goto next;
7965
7966 env->imbalance = 0;
7967 break;
7968 }
Peter Zijlstra367456c2012-02-20 21:49:09 +01007969
Kirill Tkhai163122b2014-08-20 13:48:29 +04007970 detach_task(p, env);
7971 list_add(&p->se.group_node, &env->tasks);
7972
7973 detached++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007974
Thomas Gleixnerc1a280b2019-07-26 23:19:37 +02007975#ifdef CONFIG_PREEMPTION
Peter Zijlstraee00e662009-12-17 17:25:20 +01007976 /*
7977 * NEWIDLE balancing is a source of latency, so preemptible
Kirill Tkhai163122b2014-08-20 13:48:29 +04007978 * kernels will stop after the first task is detached to minimize
Peter Zijlstraee00e662009-12-17 17:25:20 +01007979 * the critical section.
7980 */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007981 if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstraee00e662009-12-17 17:25:20 +01007982 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007983#endif
7984
Peter Zijlstraee00e662009-12-17 17:25:20 +01007985 /*
7986 * We only want to steal up to the prescribed amount of
Vincent Guittot0b0695f2019-10-18 15:26:31 +02007987 * load/util/tasks.
Peter Zijlstraee00e662009-12-17 17:25:20 +01007988 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02007989 if (env->imbalance <= 0)
Peter Zijlstraee00e662009-12-17 17:25:20 +01007990 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007991
Peter Zijlstra367456c2012-02-20 21:49:09 +01007992 continue;
7993next:
Uladzislau Rezki93824902017-09-13 12:24:30 +02007994 list_move(&p->se.group_node, tasks);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007995 }
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01007996
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01007997 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04007998 * Right now, this is one of only two places we collect this stat
7999 * so we can safely collect detach_one_task() stats here rather
8000 * than inside detach_one_task().
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008001 */
Josh Poimboeufae928822016-06-17 12:43:24 -05008002 schedstat_add(env->sd->lb_gained[env->idle], detached);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008003
Kirill Tkhai163122b2014-08-20 13:48:29 +04008004 return detached;
8005}
8006
8007/*
8008 * attach_task() -- attach the task detached by detach_task() to its new rq.
8009 */
8010static void attach_task(struct rq *rq, struct task_struct *p)
8011{
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -05008012 lockdep_assert_rq_held(rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008013
8014 BUG_ON(task_rq(p) != rq);
Peter Zijlstra5704ac02017-02-21 17:15:21 +01008015 activate_task(rq, p, ENQUEUE_NOCLOCK);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008016 check_preempt_curr(rq, p, 0);
8017}
8018
8019/*
8020 * attach_one_task() -- attaches the task returned from detach_one_task() to
8021 * its new rq.
8022 */
8023static void attach_one_task(struct rq *rq, struct task_struct *p)
8024{
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008025 struct rq_flags rf;
8026
8027 rq_lock(rq, &rf);
Peter Zijlstra5704ac02017-02-21 17:15:21 +01008028 update_rq_clock(rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008029 attach_task(rq, p);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008030 rq_unlock(rq, &rf);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008031}
8032
8033/*
8034 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
8035 * new rq.
8036 */
8037static void attach_tasks(struct lb_env *env)
8038{
8039 struct list_head *tasks = &env->tasks;
8040 struct task_struct *p;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008041 struct rq_flags rf;
Kirill Tkhai163122b2014-08-20 13:48:29 +04008042
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008043 rq_lock(env->dst_rq, &rf);
Peter Zijlstra5704ac02017-02-21 17:15:21 +01008044 update_rq_clock(env->dst_rq);
Kirill Tkhai163122b2014-08-20 13:48:29 +04008045
8046 while (!list_empty(tasks)) {
8047 p = list_first_entry(tasks, struct task_struct, se.group_node);
8048 list_del_init(&p->se.group_node);
8049
8050 attach_task(env->dst_rq, p);
8051 }
8052
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02008053 rq_unlock(env->dst_rq, &rf);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008054}
8055
Valentin Schneiderb0c79222019-06-03 12:54:24 +01008056#ifdef CONFIG_NO_HZ_COMMON
Vincent Guittot1936c532018-02-13 11:31:18 +01008057static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
8058{
8059 if (cfs_rq->avg.load_avg)
8060 return true;
8061
8062 if (cfs_rq->avg.util_avg)
8063 return true;
8064
8065 return false;
8066}
8067
Vincent Guittot91c27492018-06-28 17:45:09 +02008068static inline bool others_have_blocked(struct rq *rq)
Vincent Guittot371bf422018-06-28 17:45:05 +02008069{
8070 if (READ_ONCE(rq->avg_rt.util_avg))
8071 return true;
8072
Vincent Guittot3727e0e2018-06-28 17:45:07 +02008073 if (READ_ONCE(rq->avg_dl.util_avg))
8074 return true;
8075
Thara Gopinathb4eccf52020-02-21 19:52:10 -05008076 if (thermal_load_avg(rq))
8077 return true;
8078
Vincent Guittot11d4afd2018-09-25 11:17:42 +02008079#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
Vincent Guittot91c27492018-06-28 17:45:09 +02008080 if (READ_ONCE(rq->avg_irq.util_avg))
8081 return true;
8082#endif
8083
Vincent Guittot371bf422018-06-28 17:45:05 +02008084 return false;
8085}
8086
Vincent Guittot39b6a422021-02-24 14:30:07 +01008087static inline void update_blocked_load_tick(struct rq *rq)
8088{
8089 WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
8090}
8091
Valentin Schneiderb0c79222019-06-03 12:54:24 +01008092static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
8093{
Valentin Schneiderb0c79222019-06-03 12:54:24 +01008094 if (!has_blocked)
8095 rq->has_blocked_load = 0;
8096}
8097#else
8098static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
8099static inline bool others_have_blocked(struct rq *rq) { return false; }
Vincent Guittot39b6a422021-02-24 14:30:07 +01008100static inline void update_blocked_load_tick(struct rq *rq) {}
Valentin Schneiderb0c79222019-06-03 12:54:24 +01008101static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
8102#endif
8103
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008104static bool __update_blocked_others(struct rq *rq, bool *done)
8105{
8106 const struct sched_class *curr_class;
8107 u64 now = rq_clock_pelt(rq);
Thara Gopinathb4eccf52020-02-21 19:52:10 -05008108 unsigned long thermal_pressure;
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008109 bool decayed;
8110
8111 /*
8112 * update_load_avg() can call cpufreq_update_util(). Make sure that RT,
8113 * DL and IRQ signals have been updated before updating CFS.
8114 */
8115 curr_class = rq->curr->sched_class;
8116
Thara Gopinathb4eccf52020-02-21 19:52:10 -05008117 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
8118
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008119 decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) |
8120 update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) |
Thara Gopinath05289b92020-02-21 19:52:13 -05008121 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure) |
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008122 update_irq_load_avg(rq, 0);
8123
8124 if (others_have_blocked(rq))
8125 *done = false;
8126
8127 return decayed;
8128}
8129
Vincent Guittot1936c532018-02-13 11:31:18 +01008130#ifdef CONFIG_FAIR_GROUP_SCHED
8131
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008132static bool __update_blocked_fair(struct rq *rq, bool *done)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008133{
Vincent Guittot039ae8b2019-02-06 17:14:22 +01008134 struct cfs_rq *cfs_rq, *pos;
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008135 bool decayed = false;
8136 int cpu = cpu_of(rq);
Vincent Guittotb90f7c92019-10-30 12:18:29 +01008137
8138 /*
Peter Zijlstra9763b672011-07-13 13:09:25 +02008139 * Iterates the task_group tree in a bottom up fashion, see
8140 * list_add_leaf_cfs_rq() for details.
8141 */
Vincent Guittot039ae8b2019-02-06 17:14:22 +01008142 for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
Vincent Guittotbc427892017-03-17 14:47:22 +01008143 struct sched_entity *se;
8144
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008145 if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
Xianting Tianfe749152020-09-24 09:47:55 +08008146 update_tg_load_avg(cfs_rq);
Vincent Guittot4e516072016-11-08 10:53:46 +01008147
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008148 if (cfs_rq == &rq->cfs)
8149 decayed = true;
8150 }
8151
Vincent Guittotbc427892017-03-17 14:47:22 +01008152 /* Propagate pending load changes to the parent, if any: */
8153 se = cfs_rq->tg->se[cpu];
8154 if (se && !skip_blocked_update(se))
Vincent Guittot02da26a2021-05-27 14:29:16 +02008155 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
Tejun Heoa9e7f652017-04-25 17:43:50 -07008156
Vincent Guittot039ae8b2019-02-06 17:14:22 +01008157 /*
8158 * There can be a lot of idle CPU cgroups. Don't let fully
8159 * decayed cfs_rqs linger on the list.
8160 */
8161 if (cfs_rq_is_decayed(cfs_rq))
8162 list_del_leaf_cfs_rq(cfs_rq);
8163
Vincent Guittot1936c532018-02-13 11:31:18 +01008164 /* Don't need periodic decay once load/util_avg are null */
8165 if (cfs_rq_has_blocked(cfs_rq))
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008166 *done = false;
Yuyang Du9d89c252015-07-15 08:04:37 +08008167 }
Vincent Guittot12b04872018-08-31 17:22:55 +02008168
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008169 return decayed;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008170}
8171
Peter Zijlstra9763b672011-07-13 13:09:25 +02008172/*
Vladimir Davydov68520792013-07-15 17:49:19 +04008173 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
Peter Zijlstra9763b672011-07-13 13:09:25 +02008174 * This needs to be done in a top-down fashion because the load of a child
8175 * group is a fraction of its parents load.
8176 */
Vladimir Davydov68520792013-07-15 17:49:19 +04008177static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra9763b672011-07-13 13:09:25 +02008178{
Vladimir Davydov68520792013-07-15 17:49:19 +04008179 struct rq *rq = rq_of(cfs_rq);
8180 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstraa35b6462012-08-08 21:46:40 +02008181 unsigned long now = jiffies;
Vladimir Davydov68520792013-07-15 17:49:19 +04008182 unsigned long load;
Peter Zijlstraa35b6462012-08-08 21:46:40 +02008183
Vladimir Davydov68520792013-07-15 17:49:19 +04008184 if (cfs_rq->last_h_load_update == now)
Peter Zijlstraa35b6462012-08-08 21:46:40 +02008185 return;
8186
Mel Gorman0e9f0242019-03-19 12:36:10 +00008187 WRITE_ONCE(cfs_rq->h_load_next, NULL);
Vladimir Davydov68520792013-07-15 17:49:19 +04008188 for_each_sched_entity(se) {
8189 cfs_rq = cfs_rq_of(se);
Mel Gorman0e9f0242019-03-19 12:36:10 +00008190 WRITE_ONCE(cfs_rq->h_load_next, se);
Vladimir Davydov68520792013-07-15 17:49:19 +04008191 if (cfs_rq->last_h_load_update == now)
8192 break;
8193 }
Peter Zijlstraa35b6462012-08-08 21:46:40 +02008194
Vladimir Davydov68520792013-07-15 17:49:19 +04008195 if (!se) {
Yuyang Du7ea241a2015-07-15 08:04:42 +08008196 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
Vladimir Davydov68520792013-07-15 17:49:19 +04008197 cfs_rq->last_h_load_update = now;
8198 }
8199
Mel Gorman0e9f0242019-03-19 12:36:10 +00008200 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
Vladimir Davydov68520792013-07-15 17:49:19 +04008201 load = cfs_rq->h_load;
Yuyang Du7ea241a2015-07-15 08:04:42 +08008202 load = div64_ul(load * se->avg.load_avg,
8203 cfs_rq_load_avg(cfs_rq) + 1);
Vladimir Davydov68520792013-07-15 17:49:19 +04008204 cfs_rq = group_cfs_rq(se);
8205 cfs_rq->h_load = load;
8206 cfs_rq->last_h_load_update = now;
8207 }
Peter Zijlstra9763b672011-07-13 13:09:25 +02008208}
8209
Peter Zijlstra367456c2012-02-20 21:49:09 +01008210static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra230059de2009-12-17 17:47:12 +01008211{
Peter Zijlstra367456c2012-02-20 21:49:09 +01008212 struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra230059de2009-12-17 17:47:12 +01008213
Vladimir Davydov68520792013-07-15 17:49:19 +04008214 update_cfs_rq_h_load(cfs_rq);
Yuyang Du9d89c252015-07-15 08:04:37 +08008215 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
Yuyang Du7ea241a2015-07-15 08:04:42 +08008216 cfs_rq_load_avg(cfs_rq) + 1);
Peter Zijlstra230059de2009-12-17 17:47:12 +01008217}
8218#else
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008219static bool __update_blocked_fair(struct rq *rq, bool *done)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008220{
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08008221 struct cfs_rq *cfs_rq = &rq->cfs;
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008222 bool decayed;
Vincent Guittot6c1d47c2015-07-15 08:04:38 +08008223
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008224 decayed = update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
8225 if (cfs_rq_has_blocked(cfs_rq))
8226 *done = false;
Vincent Guittot12b04872018-08-31 17:22:55 +02008227
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008228 return decayed;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08008229}
8230
Peter Zijlstra367456c2012-02-20 21:49:09 +01008231static unsigned long task_h_load(struct task_struct *p)
8232{
Yuyang Du9d89c252015-07-15 08:04:37 +08008233 return p->se.avg.load_avg;
Peter Zijlstra230059de2009-12-17 17:47:12 +01008234}
8235#endif
8236
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008237static void update_blocked_averages(int cpu)
8238{
8239 bool decayed = false, done = true;
8240 struct rq *rq = cpu_rq(cpu);
8241 struct rq_flags rf;
8242
8243 rq_lock_irqsave(rq, &rf);
Vincent Guittot39b6a422021-02-24 14:30:07 +01008244 update_blocked_load_tick(rq);
Vincent Guittotbef69dd2019-11-18 14:21:19 +01008245 update_rq_clock(rq);
8246
8247 decayed |= __update_blocked_others(rq, &done);
8248 decayed |= __update_blocked_fair(rq, &done);
8249
8250 update_blocked_load_status(rq, !done);
8251 if (decayed)
8252 cpufreq_update_util(rq, 0);
8253 rq_unlock_irqrestore(rq, &rf);
8254}
8255
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008256/********** Helpers for find_busiest_group ************************/
Rik van Rielcaeb1782014-07-28 14:16:28 -04008257
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008258/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008259 * sg_lb_stats - stats of a sched_group required for load_balancing
8260 */
8261struct sg_lb_stats {
8262 unsigned long avg_load; /*Avg load across the CPUs of the group */
8263 unsigned long group_load; /* Total load over the CPUs of the group */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008264 unsigned long group_capacity;
Vincent Guittot070f5e82020-02-24 09:52:19 +00008265 unsigned long group_util; /* Total utilization over the CPUs of the group */
8266 unsigned long group_runnable; /* Total runnable time over the CPUs of the group */
Vincent Guittot5e23e472019-10-18 15:26:32 +02008267 unsigned int sum_nr_running; /* Nr of tasks running in the group */
Vincent Guittota3498342019-10-18 15:26:29 +02008268 unsigned int sum_h_nr_running; /* Nr of CFS tasks running in the group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008269 unsigned int idle_cpus;
8270 unsigned int group_weight;
Rik van Rielcaeb1782014-07-28 14:16:28 -04008271 enum group_type group_type;
Vincent Guittot490ba972019-10-18 15:26:28 +02008272 unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01008273 unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008274#ifdef CONFIG_NUMA_BALANCING
8275 unsigned int nr_numa_running;
8276 unsigned int nr_preferred_running;
8277#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008278};
8279
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008280/*
8281 * sd_lb_stats - Structure to store the statistics of a sched_domain
8282 * during load balancing.
8283 */
8284struct sd_lb_stats {
8285 struct sched_group *busiest; /* Busiest group in this sd */
8286 struct sched_group *local; /* Local group in this sd */
8287 unsigned long total_load; /* Total load of all groups in sd */
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008288 unsigned long total_capacity; /* Total capacity of all groups in sd */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008289 unsigned long avg_load; /* Average load across all groups in sd */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008290 unsigned int prefer_sibling; /* tasks should go to sibling first */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008291
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008292 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008293 struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09008294};
8295
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008296static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
8297{
8298 /*
8299 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
8300 * local_stat because update_sg_lb_stats() does a full clear/assignment.
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008301 * We must however set busiest_stat::group_type and
8302 * busiest_stat::idle_cpus to the worst busiest group because
8303 * update_sd_pick_busiest() reads these before assignment.
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008304 */
8305 *sds = (struct sd_lb_stats){
8306 .busiest = NULL,
8307 .local = NULL,
8308 .total_load = 0UL,
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008309 .total_capacity = 0UL,
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008310 .busiest_stat = {
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008311 .idle_cpus = UINT_MAX,
8312 .group_type = group_has_spare,
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02008313 },
8314 };
8315}
8316
Dietmar Eggemann1ca20342020-06-03 10:03:04 +02008317static unsigned long scale_rt_capacity(int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008318{
8319 struct rq *rq = cpu_rq(cpu);
Vincent Guittot8ec59c02019-06-17 17:00:17 +02008320 unsigned long max = arch_scale_cpu_capacity(cpu);
Vincent Guittot523e9792018-06-28 17:45:12 +02008321 unsigned long used, free;
Vincent Guittot523e9792018-06-28 17:45:12 +02008322 unsigned long irq;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008323
Vincent Guittot2e62c472018-07-19 14:00:06 +02008324 irq = cpu_util_irq(rq);
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07008325
Vincent Guittot523e9792018-06-28 17:45:12 +02008326 if (unlikely(irq >= max))
8327 return 1;
Peter Zijlstracadefd32014-02-27 10:40:35 +01008328
Thara Gopinath467b7d02020-02-21 19:52:11 -05008329 /*
8330 * avg_rt.util_avg and avg_dl.util_avg track binary signals
8331 * (running and not running) with weights 0 and 1024 respectively.
8332 * avg_thermal.load_avg tracks thermal pressure and the weighted
8333 * average uses the actual delta max capacity(load).
8334 */
Vincent Guittot523e9792018-06-28 17:45:12 +02008335 used = READ_ONCE(rq->avg_rt.util_avg);
8336 used += READ_ONCE(rq->avg_dl.util_avg);
Thara Gopinath467b7d02020-02-21 19:52:11 -05008337 used += thermal_load_avg(rq);
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02008338
Vincent Guittot523e9792018-06-28 17:45:12 +02008339 if (unlikely(used >= max))
8340 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008341
Vincent Guittot523e9792018-06-28 17:45:12 +02008342 free = max - used;
Vincent Guittot2e62c472018-07-19 14:00:06 +02008343
8344 return scale_irq_capacity(free, irq, max);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008345}
8346
Nicolas Pitreced549f2014-05-26 18:19:38 -04008347static void update_cpu_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008348{
Dietmar Eggemann1ca20342020-06-03 10:03:04 +02008349 unsigned long capacity = scale_rt_capacity(cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008350 struct sched_group *sdg = sd->groups;
8351
Vincent Guittot8ec59c02019-06-17 17:00:17 +02008352 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008353
Nicolas Pitreced549f2014-05-26 18:19:38 -04008354 if (!capacity)
8355 capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008356
Nicolas Pitreced549f2014-05-26 18:19:38 -04008357 cpu_rq(cpu)->cpu_capacity = capacity;
Vincent Donnefort51cf18c2020-08-28 10:00:49 +01008358 trace_sched_cpu_capacity_tp(cpu_rq(cpu));
8359
Nicolas Pitreced549f2014-05-26 18:19:38 -04008360 sdg->sgc->capacity = capacity;
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01008361 sdg->sgc->min_capacity = capacity;
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01008362 sdg->sgc->max_capacity = capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008363}
8364
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008365void update_group_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008366{
8367 struct sched_domain *child = sd->child;
8368 struct sched_group *group, *sdg = sd->groups;
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01008369 unsigned long capacity, min_capacity, max_capacity;
Vincent Guittot4ec44122011-12-12 20:21:08 +01008370 unsigned long interval;
8371
8372 interval = msecs_to_jiffies(sd->balance_interval);
8373 interval = clamp(interval, 1UL, max_load_balance_interval);
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008374 sdg->sgc->next_update = jiffies + interval;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008375
8376 if (!child) {
Nicolas Pitreced549f2014-05-26 18:19:38 -04008377 update_cpu_capacity(sd, cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008378 return;
8379 }
8380
Vincent Guittotdc7ff762015-03-03 11:35:03 +01008381 capacity = 0;
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01008382 min_capacity = ULONG_MAX;
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01008383 max_capacity = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008384
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02008385 if (child->flags & SD_OVERLAP) {
8386 /*
8387 * SD_OVERLAP domains cannot assume that child groups
8388 * span the current group.
8389 */
8390
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02008391 for_each_cpu(cpu, sched_group_span(sdg)) {
Peng Liu4c58f572020-01-04 21:08:28 +08008392 unsigned long cpu_cap = capacity_of(cpu);
Peter Zijlstra863bffc2013-08-28 11:44:39 +02008393
Peng Liu4c58f572020-01-04 21:08:28 +08008394 capacity += cpu_cap;
8395 min_capacity = min(cpu_cap, min_capacity);
8396 max_capacity = max(cpu_cap, max_capacity);
Peter Zijlstra863bffc2013-08-28 11:44:39 +02008397 }
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02008398 } else {
8399 /*
8400 * !SD_OVERLAP domains can assume that child groups
8401 * span the current group.
Byungchul Park97a71422015-07-05 18:33:48 +09008402 */
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02008403
8404 group = child->groups;
8405 do {
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01008406 struct sched_group_capacity *sgc = group->sgc;
8407
8408 capacity += sgc->capacity;
8409 min_capacity = min(sgc->min_capacity, min_capacity);
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01008410 max_capacity = max(sgc->max_capacity, max_capacity);
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02008411 group = group->next;
8412 } while (group != child->groups);
8413 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008414
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008415 sdg->sgc->capacity = capacity;
Morten Rasmussenbf475ce2016-10-14 14:41:09 +01008416 sdg->sgc->min_capacity = min_capacity;
Morten Rasmussene3d6d0c2018-07-04 11:17:41 +01008417 sdg->sgc->max_capacity = max_capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008418}
8419
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008420/*
Vincent Guittotea678212015-02-27 16:54:11 +01008421 * Check whether the capacity of the rq has been noticeably reduced by side
8422 * activity. The imbalance_pct is used for the threshold.
8423 * Return true is the capacity is reduced
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008424 */
8425static inline int
Vincent Guittotea678212015-02-27 16:54:11 +01008426check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008427{
Vincent Guittotea678212015-02-27 16:54:11 +01008428 return ((rq->cpu_capacity * sd->imbalance_pct) <
8429 (rq->cpu_capacity_orig * 100));
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10008430}
8431
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008432/*
Valentin Schneidera0fe2cf2019-02-11 17:59:45 +00008433 * Check whether a rq has a misfit task and if it looks like we can actually
8434 * help that task: we can migrate the task to a CPU of higher capacity, or
8435 * the task's current CPU is heavily pressured.
8436 */
8437static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8438{
8439 return rq->misfit_task_load &&
8440 (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
8441 check_cpu_capacity(rq, sd));
8442}
8443
8444/*
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008445 * Group imbalance indicates (and tries to solve) the problem where balancing
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02008446 * groups is inadequate due to ->cpus_ptr constraints.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008447 *
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008448 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
8449 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008450 * Something like:
8451 *
Ingo Molnar2b4d5b22016-11-23 07:37:00 +01008452 * { 0 1 2 3 } { 4 5 6 7 }
8453 * * * * *
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008454 *
8455 * If we were to balance group-wise we'd place two tasks in the first group and
8456 * two tasks in the second group. Clearly this is undesired as it will overload
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008457 * cpu 3 and leave one of the CPUs in the second group unused.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008458 *
8459 * The current solution to this issue is detecting the skew in the first group
Peter Zijlstra62633222013-08-19 12:41:09 +02008460 * by noticing the lower domain failed to reach balance and had difficulty
8461 * moving tasks due to affinity constraints.
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008462 *
8463 * When this is so detected; this group becomes a candidate for busiest; see
Kamalesh Babulaled1b7732013-10-13 23:06:15 +05308464 * update_sd_pick_busiest(). And calculate_imbalance() and
Peter Zijlstra62633222013-08-19 12:41:09 +02008465 * find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008466 * to create an effective group imbalance.
8467 *
8468 * This is a somewhat tricky proposition since the next run might not find the
8469 * group imbalance and decide the groups need to be balanced again. A most
8470 * subtle and fragile situation.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008471 */
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008472
Peter Zijlstra62633222013-08-19 12:41:09 +02008473static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008474{
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008475 return group->sgc->imbalance;
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02008476}
8477
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008478/*
Vincent Guittotea678212015-02-27 16:54:11 +01008479 * group_has_capacity returns true if the group has spare capacity that could
8480 * be used by some tasks.
8481 * We consider that a group has spare capacity if the * number of task is
Dietmar Eggemann9e91d612015-08-14 17:23:12 +01008482 * smaller than the number of CPUs or if the utilization is lower than the
8483 * available capacity for CFS tasks.
Vincent Guittotea678212015-02-27 16:54:11 +01008484 * For the latter, we use a threshold to stabilize the state, to take into
8485 * account the variance of the tasks' load and to return true if the available
8486 * capacity in meaningful for the load balancer.
8487 * As an example, an available capacity of 1% can appear but it doesn't make
8488 * any benefit for the load balance.
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008489 */
Vincent Guittotea678212015-02-27 16:54:11 +01008490static inline bool
Vincent Guittot57abff02019-10-18 15:26:38 +02008491group_has_capacity(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008492{
Vincent Guittot5e23e472019-10-18 15:26:32 +02008493 if (sgs->sum_nr_running < sgs->group_weight)
Vincent Guittotea678212015-02-27 16:54:11 +01008494 return true;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008495
Vincent Guittot070f5e82020-02-24 09:52:19 +00008496 if ((sgs->group_capacity * imbalance_pct) <
8497 (sgs->group_runnable * 100))
8498 return false;
8499
Vincent Guittotea678212015-02-27 16:54:11 +01008500 if ((sgs->group_capacity * 100) >
Vincent Guittot57abff02019-10-18 15:26:38 +02008501 (sgs->group_util * imbalance_pct))
Vincent Guittotea678212015-02-27 16:54:11 +01008502 return true;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008503
Vincent Guittotea678212015-02-27 16:54:11 +01008504 return false;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008505}
8506
Vincent Guittotea678212015-02-27 16:54:11 +01008507/*
8508 * group_is_overloaded returns true if the group has more tasks than it can
8509 * handle.
8510 * group_is_overloaded is not equals to !group_has_capacity because a group
8511 * with the exact right number of tasks, has no more spare capacity but is not
8512 * overloaded so both group_has_capacity and group_is_overloaded return
8513 * false.
8514 */
8515static inline bool
Vincent Guittot57abff02019-10-18 15:26:38 +02008516group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
Rik van Rielcaeb1782014-07-28 14:16:28 -04008517{
Vincent Guittot5e23e472019-10-18 15:26:32 +02008518 if (sgs->sum_nr_running <= sgs->group_weight)
Vincent Guittotea678212015-02-27 16:54:11 +01008519 return false;
8520
8521 if ((sgs->group_capacity * 100) <
Vincent Guittot57abff02019-10-18 15:26:38 +02008522 (sgs->group_util * imbalance_pct))
Vincent Guittotea678212015-02-27 16:54:11 +01008523 return true;
8524
Vincent Guittot070f5e82020-02-24 09:52:19 +00008525 if ((sgs->group_capacity * imbalance_pct) <
8526 (sgs->group_runnable * 100))
8527 return true;
8528
Vincent Guittotea678212015-02-27 16:54:11 +01008529 return false;
8530}
8531
Leo Yan79a89f92015-09-15 18:56:45 +08008532static inline enum
Vincent Guittot57abff02019-10-18 15:26:38 +02008533group_type group_classify(unsigned int imbalance_pct,
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008534 struct sched_group *group,
Leo Yan79a89f92015-09-15 18:56:45 +08008535 struct sg_lb_stats *sgs)
Vincent Guittotea678212015-02-27 16:54:11 +01008536{
Vincent Guittot57abff02019-10-18 15:26:38 +02008537 if (group_is_overloaded(imbalance_pct, sgs))
Rik van Rielcaeb1782014-07-28 14:16:28 -04008538 return group_overloaded;
8539
8540 if (sg_imbalanced(group))
8541 return group_imbalanced;
8542
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008543 if (sgs->group_asym_packing)
8544 return group_asym_packing;
8545
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01008546 if (sgs->group_misfit_task_load)
8547 return group_misfit_task;
8548
Vincent Guittot57abff02019-10-18 15:26:38 +02008549 if (!group_has_capacity(imbalance_pct, sgs))
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008550 return group_fully_busy;
8551
8552 return group_has_spare;
Rik van Rielcaeb1782014-07-28 14:16:28 -04008553}
8554
Ricardo Neri4006a722021-09-10 18:18:19 -07008555/**
8556 * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
8557 * @dst_cpu: Destination CPU of the load balancing
8558 * @sds: Load-balancing data with statistics of the local group
8559 * @sgs: Load-balancing statistics of the candidate busiest group
8560 * @sg: The candidate busiest group
8561 *
8562 * Check the state of the SMT siblings of both @sds::local and @sg and decide
8563 * if @dst_cpu can pull tasks.
8564 *
8565 * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
8566 * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
8567 * only if @dst_cpu has higher priority.
8568 *
8569 * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
8570 * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
8571 * Bigger imbalances in the number of busy CPUs will be dealt with in
8572 * update_sd_pick_busiest().
8573 *
8574 * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
8575 * of @dst_cpu are idle and @sg has lower priority.
Randy Dunlapa315da52021-12-17 21:59:00 -08008576 *
8577 * Return: true if @dst_cpu can pull tasks, false otherwise.
Ricardo Neri4006a722021-09-10 18:18:19 -07008578 */
8579static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
8580 struct sg_lb_stats *sgs,
8581 struct sched_group *sg)
8582{
8583#ifdef CONFIG_SCHED_SMT
8584 bool local_is_smt, sg_is_smt;
8585 int sg_busy_cpus;
8586
8587 local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
8588 sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
8589
8590 sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
8591
8592 if (!local_is_smt) {
8593 /*
8594 * If we are here, @dst_cpu is idle and does not have SMT
8595 * siblings. Pull tasks if candidate group has two or more
8596 * busy CPUs.
8597 */
8598 if (sg_busy_cpus >= 2) /* implies sg_is_smt */
8599 return true;
8600
8601 /*
8602 * @dst_cpu does not have SMT siblings. @sg may have SMT
8603 * siblings and only one is busy. In such case, @dst_cpu
8604 * can help if it has higher priority and is idle (i.e.,
8605 * it has no running tasks).
8606 */
8607 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8608 }
8609
8610 /* @dst_cpu has SMT siblings. */
8611
8612 if (sg_is_smt) {
8613 int local_busy_cpus = sds->local->group_weight -
8614 sds->local_stat.idle_cpus;
8615 int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
8616
8617 if (busy_cpus_delta == 1)
8618 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8619
8620 return false;
8621 }
8622
8623 /*
8624 * @sg does not have SMT siblings. Ensure that @sds::local does not end
8625 * up with more than one busy SMT sibling and only pull tasks if there
8626 * are not busy CPUs (i.e., no CPU has running tasks).
8627 */
8628 if (!sds->local_stat.sum_nr_running)
8629 return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
8630
8631 return false;
8632#else
8633 /* Always return false so that callers deal with non-SMT cases. */
8634 return false;
8635#endif
8636}
8637
Ricardo Neriaafc9172021-09-10 18:18:18 -07008638static inline bool
8639sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs,
8640 struct sched_group *group)
8641{
Ricardo Neri4006a722021-09-10 18:18:19 -07008642 /* Only do SMT checks if either local or candidate have SMT siblings */
8643 if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
8644 (group->flags & SD_SHARE_CPUCAPACITY))
8645 return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
8646
Ricardo Neriaafc9172021-09-10 18:18:18 -07008647 return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
8648}
8649
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008650/**
8651 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
8652 * @env: The load balancing environment.
Randy Dunlapa315da52021-12-17 21:59:00 -08008653 * @sds: Load-balancing data with statistics of the local group.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008654 * @group: sched_group whose statistics are to be updated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008655 * @sgs: variable to hold the statistics for this group.
Quentin Perret630246a2018-12-03 09:56:24 +00008656 * @sg_status: Holds flag indicating the status of the sched_group
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008657 */
8658static inline void update_sg_lb_stats(struct lb_env *env,
Ricardo Neric0d14b52021-09-10 18:18:17 -07008659 struct sd_lb_stats *sds,
Quentin Perret630246a2018-12-03 09:56:24 +00008660 struct sched_group *group,
8661 struct sg_lb_stats *sgs,
8662 int *sg_status)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008663{
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008664 int i, nr_running, local_group;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008665
Peter Zijlstrab72ff132013-08-28 10:32:32 +02008666 memset(sgs, 0, sizeof(*sgs));
8667
Ricardo Neric0d14b52021-09-10 18:18:17 -07008668 local_group = group == sds->local;
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008669
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02008670 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008671 struct rq *rq = cpu_rq(i);
8672
Vincent Guittotb0fb1eb2019-10-18 15:26:33 +02008673 sgs->group_load += cpu_load(rq);
Dietmar Eggemann82762d22021-11-18 17:42:40 +01008674 sgs->group_util += cpu_util_cfs(i);
Vincent Guittot070f5e82020-02-24 09:52:19 +00008675 sgs->group_runnable += cpu_runnable(rq);
Vincent Guittota3498342019-10-18 15:26:29 +02008676 sgs->sum_h_nr_running += rq->cfs.h_nr_running;
Tim Chen4486edd2014-06-23 12:16:49 -07008677
Waiman Longa426f992015-11-25 14:09:38 -05008678 nr_running = rq->nr_running;
Vincent Guittot5e23e472019-10-18 15:26:32 +02008679 sgs->sum_nr_running += nr_running;
8680
Waiman Longa426f992015-11-25 14:09:38 -05008681 if (nr_running > 1)
Quentin Perret630246a2018-12-03 09:56:24 +00008682 *sg_status |= SG_OVERLOAD;
Tim Chen4486edd2014-06-23 12:16:49 -07008683
Morten Rasmussen2802bf32018-12-03 09:56:25 +00008684 if (cpu_overutilized(i))
8685 *sg_status |= SG_OVERUTILIZED;
Kamalesh Babulal380c9072013-11-15 15:06:52 +05308686
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008687#ifdef CONFIG_NUMA_BALANCING
8688 sgs->nr_numa_running += rq->nr_numa_running;
8689 sgs->nr_preferred_running += rq->nr_preferred_running;
8690#endif
Waiman Longa426f992015-11-25 14:09:38 -05008691 /*
8692 * No need to call idle_cpu() if nr_running is not 0
8693 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008694 if (!nr_running && idle_cpu(i)) {
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008695 sgs->idle_cpus++;
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008696 /* Idle cpu can't have misfit task */
8697 continue;
8698 }
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01008699
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008700 if (local_group)
8701 continue;
8702
8703 /* Check for a misfit task on the cpu */
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01008704 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
Valentin Schneider757ffdd2018-07-04 11:17:47 +01008705 sgs->group_misfit_task_load < rq->misfit_task_load) {
Morten Rasmussen3b1baa62018-07-04 11:17:40 +01008706 sgs->group_misfit_task_load = rq->misfit_task_load;
Quentin Perret630246a2018-12-03 09:56:24 +00008707 *sg_status |= SG_OVERLOAD;
Valentin Schneider757ffdd2018-07-04 11:17:47 +01008708 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008709 }
8710
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04008711 sgs->group_capacity = group->sgc->capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008712
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07008713 sgs->group_weight = group->group_weight;
Peter Zijlstrab37d9312013-08-28 11:50:34 +02008714
Ricardo Neriaafc9172021-09-10 18:18:18 -07008715 /* Check if dst CPU is idle and preferred to this group */
8716 if (!local_group && env->sd->flags & SD_ASYM_PACKING &&
8717 env->idle != CPU_NOT_IDLE && sgs->sum_h_nr_running &&
8718 sched_asym(env, sds, sgs, group)) {
8719 sgs->group_asym_packing = 1;
8720 }
8721
Vincent Guittot57abff02019-10-18 15:26:38 +02008722 sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008723
8724 /* Computing avg_load makes sense only when group is overloaded */
8725 if (sgs->group_type == group_overloaded)
8726 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8727 sgs->group_capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01008728}
8729
8730/**
Michael Neuling532cb4c2010-06-08 14:57:02 +10008731 * update_sd_pick_busiest - return 1 on busiest group
Randy Dunlapcd968912012-06-08 13:18:33 -07008732 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10008733 * @sds: sched_domain statistics
8734 * @sg: sched_group candidate to be checked for being the busiest
Michael Neulingb6b12292010-06-10 12:06:21 +10008735 * @sgs: sched_group statistics
Michael Neuling532cb4c2010-06-08 14:57:02 +10008736 *
8737 * Determine if @sg is a busier group than the previously selected
8738 * busiest group.
Yacine Belkadie69f6182013-07-12 20:45:47 +02008739 *
8740 * Return: %true if @sg is a busier group than the previously selected
8741 * busiest group. %false otherwise.
Michael Neuling532cb4c2010-06-08 14:57:02 +10008742 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008743static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling532cb4c2010-06-08 14:57:02 +10008744 struct sd_lb_stats *sds,
8745 struct sched_group *sg,
Peter Zijlstrabd939f42012-05-02 14:20:37 +02008746 struct sg_lb_stats *sgs)
Michael Neuling532cb4c2010-06-08 14:57:02 +10008747{
Rik van Rielcaeb1782014-07-28 14:16:28 -04008748 struct sg_lb_stats *busiest = &sds->busiest_stat;
Michael Neuling532cb4c2010-06-08 14:57:02 +10008749
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008750 /* Make sure that there is at least one task to pull */
8751 if (!sgs->sum_h_nr_running)
8752 return false;
8753
Morten Rasmussencad68e52018-07-04 11:17:42 +01008754 /*
8755 * Don't try to pull misfit tasks we can't help.
8756 * We can use max_capacity here as reduction in capacity on some
8757 * CPUs in the group should either be possible to resolve
8758 * internally or be covered by avg_load imbalance (eventually).
8759 */
8760 if (sgs->group_type == group_misfit_task &&
Valentin Schneider4aed8aa2021-04-07 23:06:28 +01008761 (!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008762 sds->local_stat.group_type != group_has_spare))
Morten Rasmussencad68e52018-07-04 11:17:42 +01008763 return false;
8764
Rik van Rielcaeb1782014-07-28 14:16:28 -04008765 if (sgs->group_type > busiest->group_type)
Michael Neuling532cb4c2010-06-08 14:57:02 +10008766 return true;
8767
Rik van Rielcaeb1782014-07-28 14:16:28 -04008768 if (sgs->group_type < busiest->group_type)
8769 return false;
8770
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008771 /*
8772 * The candidate and the current busiest group are the same type of
8773 * group. Let check which one is the busiest according to the type.
8774 */
8775
8776 switch (sgs->group_type) {
8777 case group_overloaded:
8778 /* Select the overloaded group with highest avg_load. */
8779 if (sgs->avg_load <= busiest->avg_load)
8780 return false;
8781 break;
8782
8783 case group_imbalanced:
8784 /*
8785 * Select the 1st imbalanced group as we don't have any way to
8786 * choose one more than another.
8787 */
Rik van Rielcaeb1782014-07-28 14:16:28 -04008788 return false;
8789
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008790 case group_asym_packing:
Ingo Molnar97fb7a02018-03-03 14:01:12 +01008791 /* Prefer to move from lowest priority CPU's work */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008792 if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
8793 return false;
8794 break;
8795
8796 case group_misfit_task:
8797 /*
8798 * If we have more than one misfit sg go with the biggest
8799 * misfit.
8800 */
8801 if (sgs->group_misfit_task_load < busiest->group_misfit_task_load)
8802 return false;
8803 break;
8804
8805 case group_fully_busy:
8806 /*
8807 * Select the fully busy group with highest avg_load. In
8808 * theory, there is no need to pull task from such kind of
8809 * group because tasks have all compute capacity that they need
8810 * but we can still improve the overall throughput by reducing
8811 * contention when accessing shared HW resources.
8812 *
8813 * XXX for now avg_load is not computed and always 0 so we
8814 * select the 1st one.
8815 */
8816 if (sgs->avg_load <= busiest->avg_load)
8817 return false;
8818 break;
8819
8820 case group_has_spare:
8821 /*
Vincent Guittot5f68eb12019-12-20 12:04:53 +01008822 * Select not overloaded group with lowest number of idle cpus
8823 * and highest number of running tasks. We could also compare
8824 * the spare capacity which is more stable but it can end up
8825 * that the group has less spare capacity but finally more idle
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008826 * CPUs which means less opportunity to pull tasks.
8827 */
Vincent Guittot5f68eb12019-12-20 12:04:53 +01008828 if (sgs->idle_cpus > busiest->idle_cpus)
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008829 return false;
Vincent Guittot5f68eb12019-12-20 12:04:53 +01008830 else if ((sgs->idle_cpus == busiest->idle_cpus) &&
8831 (sgs->sum_nr_running <= busiest->sum_nr_running))
8832 return false;
8833
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008834 break;
Michael Neuling532cb4c2010-06-08 14:57:02 +10008835 }
8836
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008837 /*
8838 * Candidate sg has no more than one task per CPU and has higher
8839 * per-CPU capacity. Migrating tasks to less capable CPUs may harm
8840 * throughput. Maximize throughput, power/energy consequences are not
8841 * considered.
8842 */
8843 if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
8844 (sgs->group_type <= group_fully_busy) &&
Valentin Schneider4aed8aa2021-04-07 23:06:28 +01008845 (capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
Vincent Guittot0b0695f2019-10-18 15:26:31 +02008846 return false;
8847
8848 return true;
Michael Neuling532cb4c2010-06-08 14:57:02 +10008849}
8850
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008851#ifdef CONFIG_NUMA_BALANCING
8852static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8853{
Vincent Guittota3498342019-10-18 15:26:29 +02008854 if (sgs->sum_h_nr_running > sgs->nr_numa_running)
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008855 return regular;
Vincent Guittota3498342019-10-18 15:26:29 +02008856 if (sgs->sum_h_nr_running > sgs->nr_preferred_running)
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01008857 return remote;
8858 return all;
8859}
8860
8861static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8862{
8863 if (rq->nr_running > rq->nr_numa_running)
8864 return regular;
8865 if (rq->nr_running > rq->nr_preferred_running)
8866 return remote;
8867 return all;
8868}
8869#else
8870static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8871{
8872 return all;
8873}
8874
8875static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8876{
8877 return regular;
8878}
8879#endif /* CONFIG_NUMA_BALANCING */
8880
Vincent Guittot57abff02019-10-18 15:26:38 +02008881
8882struct sg_lb_stats;
8883
8884/*
Vincent Guittot3318544b2019-10-22 18:46:38 +02008885 * task_running_on_cpu - return 1 if @p is running on @cpu.
8886 */
8887
8888static unsigned int task_running_on_cpu(int cpu, struct task_struct *p)
8889{
8890 /* Task has no contribution or is new */
8891 if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
8892 return 0;
8893
8894 if (task_on_rq_queued(p))
8895 return 1;
8896
8897 return 0;
8898}
8899
8900/**
8901 * idle_cpu_without - would a given CPU be idle without p ?
8902 * @cpu: the processor on which idleness is tested.
8903 * @p: task which should be ignored.
8904 *
8905 * Return: 1 if the CPU would be idle. 0 otherwise.
8906 */
8907static int idle_cpu_without(int cpu, struct task_struct *p)
8908{
8909 struct rq *rq = cpu_rq(cpu);
8910
8911 if (rq->curr != rq->idle && rq->curr != p)
8912 return 0;
8913
8914 /*
8915 * rq->nr_running can't be used but an updated version without the
8916 * impact of p on cpu must be used instead. The updated nr_running
8917 * be computed and tested before calling idle_cpu_without().
8918 */
8919
8920#ifdef CONFIG_SMP
Peter Zijlstra126c2092020-05-26 18:11:03 +02008921 if (rq->ttwu_pending)
Vincent Guittot3318544b2019-10-22 18:46:38 +02008922 return 0;
8923#endif
8924
8925 return 1;
8926}
8927
8928/*
Vincent Guittot57abff02019-10-18 15:26:38 +02008929 * update_sg_wakeup_stats - Update sched_group's statistics for wakeup.
Vincent Guittot3318544b2019-10-22 18:46:38 +02008930 * @sd: The sched_domain level to look for idlest group.
Vincent Guittot57abff02019-10-18 15:26:38 +02008931 * @group: sched_group whose statistics are to be updated.
8932 * @sgs: variable to hold the statistics for this group.
Vincent Guittot3318544b2019-10-22 18:46:38 +02008933 * @p: The task for which we look for the idlest group/CPU.
Vincent Guittot57abff02019-10-18 15:26:38 +02008934 */
8935static inline void update_sg_wakeup_stats(struct sched_domain *sd,
8936 struct sched_group *group,
8937 struct sg_lb_stats *sgs,
8938 struct task_struct *p)
8939{
8940 int i, nr_running;
8941
8942 memset(sgs, 0, sizeof(*sgs));
8943
8944 for_each_cpu(i, sched_group_span(group)) {
8945 struct rq *rq = cpu_rq(i);
Vincent Guittot3318544b2019-10-22 18:46:38 +02008946 unsigned int local;
Vincent Guittot57abff02019-10-18 15:26:38 +02008947
Vincent Guittot3318544b2019-10-22 18:46:38 +02008948 sgs->group_load += cpu_load_without(rq, p);
Vincent Guittot57abff02019-10-18 15:26:38 +02008949 sgs->group_util += cpu_util_without(i, p);
Vincent Guittot070f5e82020-02-24 09:52:19 +00008950 sgs->group_runnable += cpu_runnable_without(rq, p);
Vincent Guittot3318544b2019-10-22 18:46:38 +02008951 local = task_running_on_cpu(i, p);
8952 sgs->sum_h_nr_running += rq->cfs.h_nr_running - local;
Vincent Guittot57abff02019-10-18 15:26:38 +02008953
Vincent Guittot3318544b2019-10-22 18:46:38 +02008954 nr_running = rq->nr_running - local;
Vincent Guittot57abff02019-10-18 15:26:38 +02008955 sgs->sum_nr_running += nr_running;
8956
8957 /*
Vincent Guittot3318544b2019-10-22 18:46:38 +02008958 * No need to call idle_cpu_without() if nr_running is not 0
Vincent Guittot57abff02019-10-18 15:26:38 +02008959 */
Vincent Guittot3318544b2019-10-22 18:46:38 +02008960 if (!nr_running && idle_cpu_without(i, p))
Vincent Guittot57abff02019-10-18 15:26:38 +02008961 sgs->idle_cpus++;
8962
Vincent Guittot57abff02019-10-18 15:26:38 +02008963 }
8964
8965 /* Check if task fits in the group */
8966 if (sd->flags & SD_ASYM_CPUCAPACITY &&
8967 !task_fits_capacity(p, group->sgc->max_capacity)) {
8968 sgs->group_misfit_task_load = 1;
8969 }
8970
8971 sgs->group_capacity = group->sgc->capacity;
8972
Vincent Guittot289de352020-02-18 15:45:34 +01008973 sgs->group_weight = group->group_weight;
8974
Vincent Guittot57abff02019-10-18 15:26:38 +02008975 sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
8976
8977 /*
8978 * Computing avg_load makes sense only when group is fully busy or
8979 * overloaded
8980 */
Tao Zhou6c8116c2020-03-19 11:39:20 +08008981 if (sgs->group_type == group_fully_busy ||
8982 sgs->group_type == group_overloaded)
Vincent Guittot57abff02019-10-18 15:26:38 +02008983 sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
8984 sgs->group_capacity;
8985}
8986
8987static bool update_pick_idlest(struct sched_group *idlest,
8988 struct sg_lb_stats *idlest_sgs,
8989 struct sched_group *group,
8990 struct sg_lb_stats *sgs)
8991{
8992 if (sgs->group_type < idlest_sgs->group_type)
8993 return true;
8994
8995 if (sgs->group_type > idlest_sgs->group_type)
8996 return false;
8997
8998 /*
8999 * The candidate and the current idlest group are the same type of
9000 * group. Let check which one is the idlest according to the type.
9001 */
9002
9003 switch (sgs->group_type) {
9004 case group_overloaded:
9005 case group_fully_busy:
9006 /* Select the group with lowest avg_load. */
9007 if (idlest_sgs->avg_load <= sgs->avg_load)
9008 return false;
9009 break;
9010
9011 case group_imbalanced:
9012 case group_asym_packing:
9013 /* Those types are not used in the slow wakeup path */
9014 return false;
9015
9016 case group_misfit_task:
9017 /* Select group with the highest max capacity */
9018 if (idlest->sgc->max_capacity >= group->sgc->max_capacity)
9019 return false;
9020 break;
9021
9022 case group_has_spare:
9023 /* Select group with most idle CPUs */
Peter Puhov3edecfe2020-07-14 08:59:41 -04009024 if (idlest_sgs->idle_cpus > sgs->idle_cpus)
Vincent Guittot57abff02019-10-18 15:26:38 +02009025 return false;
Peter Puhov3edecfe2020-07-14 08:59:41 -04009026
9027 /* Select group with lowest group_util */
9028 if (idlest_sgs->idle_cpus == sgs->idle_cpus &&
9029 idlest_sgs->group_util <= sgs->group_util)
9030 return false;
9031
Vincent Guittot57abff02019-10-18 15:26:38 +02009032 break;
9033 }
9034
9035 return true;
9036}
9037
9038/*
Mel Gorman23e60822020-11-20 09:06:30 +00009039 * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
9040 * This is an approximation as the number of running tasks may not be
9041 * related to the number of busy CPUs due to sched_setaffinity.
9042 */
9043static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
9044{
9045 return (dst_running < (dst_weight >> 2));
9046}
9047
9048/*
Vincent Guittot57abff02019-10-18 15:26:38 +02009049 * find_idlest_group() finds and returns the least busy CPU group within the
9050 * domain.
9051 *
9052 * Assumes p is allowed on at least one CPU in sd.
9053 */
9054static struct sched_group *
Valentin Schneider45da2772020-04-15 22:05:04 +01009055find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
Vincent Guittot57abff02019-10-18 15:26:38 +02009056{
9057 struct sched_group *idlest = NULL, *local = NULL, *group = sd->groups;
9058 struct sg_lb_stats local_sgs, tmp_sgs;
9059 struct sg_lb_stats *sgs;
9060 unsigned long imbalance;
9061 struct sg_lb_stats idlest_sgs = {
9062 .avg_load = UINT_MAX,
9063 .group_type = group_overloaded,
9064 };
9065
Vincent Guittot57abff02019-10-18 15:26:38 +02009066 do {
9067 int local_group;
9068
9069 /* Skip over this group if it has no CPUs allowed */
9070 if (!cpumask_intersects(sched_group_span(group),
9071 p->cpus_ptr))
9072 continue;
9073
Aubrey Li97886d92021-03-24 17:40:13 -04009074 /* Skip over this group if no cookie matched */
9075 if (!sched_group_cookie_match(cpu_rq(this_cpu), p, group))
9076 continue;
9077
Vincent Guittot57abff02019-10-18 15:26:38 +02009078 local_group = cpumask_test_cpu(this_cpu,
9079 sched_group_span(group));
9080
9081 if (local_group) {
9082 sgs = &local_sgs;
9083 local = group;
9084 } else {
9085 sgs = &tmp_sgs;
9086 }
9087
9088 update_sg_wakeup_stats(sd, group, sgs, p);
9089
9090 if (!local_group && update_pick_idlest(idlest, &idlest_sgs, group, sgs)) {
9091 idlest = group;
9092 idlest_sgs = *sgs;
9093 }
9094
9095 } while (group = group->next, group != sd->groups);
9096
9097
9098 /* There is no idlest group to push tasks to */
9099 if (!idlest)
9100 return NULL;
9101
Vincent Guittot7ed735c2019-12-04 19:21:40 +01009102 /* The local group has been skipped because of CPU affinity */
9103 if (!local)
9104 return idlest;
9105
Vincent Guittot57abff02019-10-18 15:26:38 +02009106 /*
9107 * If the local group is idler than the selected idlest group
9108 * don't try and push the task.
9109 */
9110 if (local_sgs.group_type < idlest_sgs.group_type)
9111 return NULL;
9112
9113 /*
9114 * If the local group is busier than the selected idlest group
9115 * try and push the task.
9116 */
9117 if (local_sgs.group_type > idlest_sgs.group_type)
9118 return idlest;
9119
9120 switch (local_sgs.group_type) {
9121 case group_overloaded:
9122 case group_fully_busy:
Mel Gorman5c339002020-11-20 09:06:28 +00009123
9124 /* Calculate allowed imbalance based on load */
9125 imbalance = scale_load_down(NICE_0_LOAD) *
9126 (sd->imbalance_pct-100) / 100;
9127
Vincent Guittot57abff02019-10-18 15:26:38 +02009128 /*
9129 * When comparing groups across NUMA domains, it's possible for
9130 * the local domain to be very lightly loaded relative to the
9131 * remote domains but "imbalance" skews the comparison making
9132 * remote CPUs look much more favourable. When considering
9133 * cross-domain, add imbalance to the load on the remote node
9134 * and consider staying local.
9135 */
9136
9137 if ((sd->flags & SD_NUMA) &&
9138 ((idlest_sgs.avg_load + imbalance) >= local_sgs.avg_load))
9139 return NULL;
9140
9141 /*
9142 * If the local group is less loaded than the selected
9143 * idlest group don't try and push any tasks.
9144 */
9145 if (idlest_sgs.avg_load >= (local_sgs.avg_load + imbalance))
9146 return NULL;
9147
9148 if (100 * local_sgs.avg_load <= sd->imbalance_pct * idlest_sgs.avg_load)
9149 return NULL;
9150 break;
9151
9152 case group_imbalanced:
9153 case group_asym_packing:
9154 /* Those type are not used in the slow wakeup path */
9155 return NULL;
9156
9157 case group_misfit_task:
9158 /* Select group with the highest max capacity */
9159 if (local->sgc->max_capacity >= idlest->sgc->max_capacity)
9160 return NULL;
9161 break;
9162
9163 case group_has_spare:
9164 if (sd->flags & SD_NUMA) {
9165#ifdef CONFIG_NUMA_BALANCING
9166 int idlest_cpu;
9167 /*
9168 * If there is spare capacity at NUMA, try to select
9169 * the preferred node
9170 */
9171 if (cpu_to_node(this_cpu) == p->numa_preferred_nid)
9172 return NULL;
9173
9174 idlest_cpu = cpumask_first(sched_group_span(idlest));
9175 if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
9176 return idlest;
9177#endif
9178 /*
9179 * Otherwise, keep the task on this node to stay close
9180 * its wakeup source and improve locality. If there is
9181 * a real need of migration, periodic load balance will
9182 * take care of it.
9183 */
Mel Gorman23e60822020-11-20 09:06:30 +00009184 if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
Vincent Guittot57abff02019-10-18 15:26:38 +02009185 return NULL;
9186 }
9187
9188 /*
9189 * Select group with highest number of idle CPUs. We could also
9190 * compare the utilization which is more stable but it can end
9191 * up that the group has less spare capacity but finally more
9192 * idle CPUs which means more opportunity to run task.
9193 */
9194 if (local_sgs.idle_cpus >= idlest_sgs.idle_cpus)
9195 return NULL;
9196 break;
9197 }
9198
9199 return idlest;
9200}
9201
Michael Neuling532cb4c2010-06-08 14:57:02 +10009202/**
Hui Kang461819a2011-10-11 23:00:59 -04009203 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07009204 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009205 * @sds: variable to hold the statistics for this sched_domain.
9206 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009207
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009208static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009209{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009210 struct sched_domain *child = env->sd->child;
9211 struct sched_group *sg = env->sd->groups;
Srikar Dronamraju05b40e02017-03-22 23:27:50 +05309212 struct sg_lb_stats *local = &sds->local_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009213 struct sg_lb_stats tmp_sgs;
Quentin Perret630246a2018-12-03 09:56:24 +00009214 int sg_status = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009215
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009216 do {
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009217 struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009218 int local_group;
9219
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02009220 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(sg));
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009221 if (local_group) {
9222 sds->local = sg;
Srikar Dronamraju05b40e02017-03-22 23:27:50 +05309223 sgs = local;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009224
9225 if (env->idle != CPU_NEWLY_IDLE ||
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009226 time_after_eq(jiffies, sg->sgc->next_update))
9227 update_group_capacity(env->sd, env->dst_cpu);
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009228 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009229
Ricardo Neric0d14b52021-09-10 18:18:17 -07009230 update_sg_lb_stats(env, sds, sg, sgs, &sg_status);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009231
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009232 if (local_group)
9233 goto next_group;
9234
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009235
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009236 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10009237 sds->busiest = sg;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009238 sds->busiest_stat = *sgs;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009239 }
9240
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009241next_group:
9242 /* Now, start updating sd_lb_stats */
9243 sds->total_load += sgs->group_load;
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009244 sds->total_capacity += sgs->group_capacity;
Peter Zijlstrab72ff132013-08-28 10:32:32 +02009245
Michael Neuling532cb4c2010-06-08 14:57:02 +10009246 sg = sg->next;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009247 } while (sg != env->sd->groups);
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009248
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009249 /* Tag domain that child domain prefers tasks go to siblings first */
9250 sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
9251
Vincent Guittotf643ea22018-02-13 11:31:17 +01009252
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009253 if (env->sd->flags & SD_NUMA)
9254 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
Tim Chen4486edd2014-06-23 12:16:49 -07009255
9256 if (!env->sd->parent) {
Morten Rasmussen2802bf32018-12-03 09:56:25 +00009257 struct root_domain *rd = env->dst_rq->rd;
9258
Tim Chen4486edd2014-06-23 12:16:49 -07009259 /* update overload indicator if we are at root domain */
Morten Rasmussen2802bf32018-12-03 09:56:25 +00009260 WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
9261
9262 /* Update over-utilization (tipping point, U >= 0) indicator */
9263 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
Qais Youseff9f240f2019-06-04 12:14:58 +01009264 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
Morten Rasmussen2802bf32018-12-03 09:56:25 +00009265 } else if (sg_status & SG_OVERUTILIZED) {
Qais Youseff9f240f2019-06-04 12:14:58 +01009266 struct root_domain *rd = env->dst_rq->rd;
9267
9268 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
9269 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
Tim Chen4486edd2014-06-23 12:16:49 -07009270 }
Michael Neuling532cb4c2010-06-08 14:57:02 +10009271}
9272
Mel Gormanabeae76a2020-11-20 09:06:27 +00009273#define NUMA_IMBALANCE_MIN 2
9274
Mel Gorman7d2b5dd2020-11-20 09:06:29 +00009275static inline long adjust_numa_imbalance(int imbalance,
9276 int dst_running, int dst_weight)
Mel Gormanfb86f5b2020-02-24 09:52:16 +00009277{
Mel Gorman23e60822020-11-20 09:06:30 +00009278 if (!allow_numa_imbalance(dst_running, dst_weight))
9279 return imbalance;
9280
Mel Gormanfb86f5b2020-02-24 09:52:16 +00009281 /*
9282 * Allow a small imbalance based on a simple pair of communicating
Mel Gorman7d2b5dd2020-11-20 09:06:29 +00009283 * tasks that remain local when the destination is lightly loaded.
Mel Gormanfb86f5b2020-02-24 09:52:16 +00009284 */
Mel Gorman23e60822020-11-20 09:06:30 +00009285 if (imbalance <= NUMA_IMBALANCE_MIN)
Mel Gormanfb86f5b2020-02-24 09:52:16 +00009286 return 0;
9287
9288 return imbalance;
9289}
9290
Michael Neuling532cb4c2010-06-08 14:57:02 +10009291/**
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009292 * calculate_imbalance - Calculate the amount of imbalance present within the
9293 * groups of a given sched_domain during load balance.
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009294 * @env: load balance environment
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009295 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009296 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009297static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009298{
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009299 struct sg_lb_stats *local, *busiest;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009300
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009301 local = &sds->local_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009302 busiest = &sds->busiest_stat;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009303
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009304 if (busiest->group_type == group_misfit_task) {
9305 /* Set imbalance to allow misfit tasks to be balanced. */
9306 env->migration_type = migrate_misfit;
Vincent Guittotc63be7b2019-10-18 15:26:35 +02009307 env->imbalance = 1;
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009308 return;
9309 }
9310
9311 if (busiest->group_type == group_asym_packing) {
9312 /*
9313 * In case of asym capacity, we will try to migrate all load to
9314 * the preferred CPU.
9315 */
9316 env->migration_type = migrate_task;
9317 env->imbalance = busiest->sum_h_nr_running;
9318 return;
9319 }
9320
9321 if (busiest->group_type == group_imbalanced) {
9322 /*
9323 * In the group_imb case we cannot rely on group-wide averages
9324 * to ensure CPU-load equilibrium, try to move any task to fix
9325 * the imbalance. The next load balance will take care of
9326 * balancing back the system.
9327 */
9328 env->migration_type = migrate_task;
9329 env->imbalance = 1;
Vincent Guittot490ba972019-10-18 15:26:28 +02009330 return;
9331 }
9332
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009333 /*
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009334 * Try to use spare capacity of local group without overloading it or
Vincent Guittota9723382019-11-12 15:50:43 +01009335 * emptying busiest.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009336 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009337 if (local->group_type == group_has_spare) {
Vincent Guittot16b0a7a2020-11-02 11:24:57 +01009338 if ((busiest->group_type > group_fully_busy) &&
9339 !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009340 /*
9341 * If busiest is overloaded, try to fill spare
9342 * capacity. This might end up creating spare capacity
9343 * in busiest or busiest still being overloaded but
9344 * there is no simple way to directly compute the
9345 * amount of load to migrate in order to balance the
9346 * system.
9347 */
9348 env->migration_type = migrate_util;
9349 env->imbalance = max(local->group_capacity, local->group_util) -
9350 local->group_util;
9351
9352 /*
9353 * In some cases, the group's utilization is max or even
9354 * higher than capacity because of migrations but the
9355 * local CPU is (newly) idle. There is at least one
9356 * waiting task in this overloaded busiest group. Let's
9357 * try to pull it.
9358 */
9359 if (env->idle != CPU_NOT_IDLE && env->imbalance == 0) {
9360 env->migration_type = migrate_task;
9361 env->imbalance = 1;
9362 }
9363
9364 return;
9365 }
9366
9367 if (busiest->group_weight == 1 || sds->prefer_sibling) {
Vincent Guittot5e23e472019-10-18 15:26:32 +02009368 unsigned int nr_diff = busiest->sum_nr_running;
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009369 /*
9370 * When prefer sibling, evenly spread running tasks on
9371 * groups.
9372 */
9373 env->migration_type = migrate_task;
Vincent Guittot5e23e472019-10-18 15:26:32 +02009374 lsub_positive(&nr_diff, local->sum_nr_running);
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009375 env->imbalance = nr_diff >> 1;
Mel Gormanb396f522020-01-14 10:13:20 +00009376 } else {
9377
9378 /*
9379 * If there is no overload, we just want to even the number of
9380 * idle cpus.
9381 */
9382 env->migration_type = migrate_task;
9383 env->imbalance = max_t(long, 0, (local->idle_cpus -
9384 busiest->idle_cpus) >> 1);
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009385 }
9386
Mel Gormanb396f522020-01-14 10:13:20 +00009387 /* Consider allowing a small imbalance between NUMA groups */
Mel Gorman7d2b5dd2020-11-20 09:06:29 +00009388 if (env->sd->flags & SD_NUMA) {
Mel Gormanfb86f5b2020-02-24 09:52:16 +00009389 env->imbalance = adjust_numa_imbalance(env->imbalance,
Mel Gorman7d2b5dd2020-11-20 09:06:29 +00009390 busiest->sum_nr_running, busiest->group_weight);
9391 }
Mel Gormanb396f522020-01-14 10:13:20 +00009392
Vincent Guittotfcf05532019-10-18 15:26:30 +02009393 return;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009394 }
9395
Peter Zijlstra9a5d9ba2014-07-29 17:15:11 +02009396 /*
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009397 * Local is fully busy but has to take more load to relieve the
9398 * busiest group
Peter Zijlstra9a5d9ba2014-07-29 17:15:11 +02009399 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009400 if (local->group_type < group_overloaded) {
9401 /*
9402 * Local will become overloaded so the avg_load metrics are
9403 * finally needed.
9404 */
9405
9406 local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
9407 local->group_capacity;
9408
9409 sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
9410 sds->total_capacity;
Aubrey Li111688c2020-03-26 13:42:29 +08009411 /*
9412 * If the local group is more loaded than the selected
9413 * busiest group don't try to pull any tasks.
9414 */
9415 if (local->avg_load >= busiest->avg_load) {
9416 env->imbalance = 0;
9417 return;
9418 }
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009419 }
9420
9421 /*
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009422 * Both group are or will become overloaded and we're trying to get all
9423 * the CPUs to the average_load, so we don't want to push ourselves
9424 * above the average load, nor do we wish to reduce the max loaded CPU
9425 * below the average load. At the same time, we also don't want to
9426 * reduce the group load below the group capacity. Thus we look for
9427 * the minimum possible imbalance.
Suresh Siddhadd5feea2010-02-23 16:13:52 -08009428 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009429 env->migration_type = migrate_load;
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009430 env->imbalance = min(
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009431 (busiest->avg_load - sds->avg_load) * busiest->group_capacity,
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009432 (sds->avg_load - local->avg_load) * local->group_capacity
Nicolas Pitreca8ce3d2014-05-26 18:19:39 -04009433 ) / SCHED_CAPACITY_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009434}
Nikhil Raofab47622010-10-15 13:12:29 -07009435
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009436/******* find_busiest_group() helpers end here *********************/
9437
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009438/*
9439 * Decision matrix according to the local and busiest group type:
9440 *
9441 * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
9442 * has_spare nr_idle balanced N/A N/A balanced balanced
9443 * fully_busy nr_idle nr_idle N/A N/A balanced balanced
9444 * misfit_task force N/A N/A N/A force force
9445 * asym_packing force force N/A N/A force force
9446 * imbalanced force force N/A N/A force force
9447 * overloaded force force N/A N/A force avg_load
9448 *
9449 * N/A : Not Applicable because already filtered while updating
9450 * statistics.
9451 * balanced : The system is balanced for these 2 groups.
9452 * force : Calculate the imbalance as load migration is probably needed.
9453 * avg_load : Only if imbalance is significant enough.
9454 * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite
9455 * different in groups.
9456 */
9457
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009458/**
9459 * find_busiest_group - Returns the busiest group within the sched_domain
Dietmar Eggemann0a9b23c2016-04-29 20:32:38 +01009460 * if there is an imbalance.
Randy Dunlapa315da52021-12-17 21:59:00 -08009461 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009462 *
Dietmar Eggemanna3df0672019-06-18 14:23:10 +02009463 * Also calculates the amount of runnable load which should be moved
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009464 * to restore balance.
9465 *
Yacine Belkadie69f6182013-07-12 20:45:47 +02009466 * Return: - The busiest group if imbalance exists.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009467 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009468static struct sched_group *find_busiest_group(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009469{
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009470 struct sg_lb_stats *local, *busiest;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009471 struct sd_lb_stats sds;
9472
Peter Zijlstra147c5fc2013-08-19 15:22:57 +02009473 init_sd_lb_stats(&sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009474
9475 /*
Vincent Guittotb0fb1eb2019-10-18 15:26:33 +02009476 * Compute the various statistics relevant for load balancing at
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009477 * this level.
9478 */
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009479 update_sd_lb_stats(env, &sds);
Morten Rasmussen2802bf32018-12-03 09:56:25 +00009480
Peter Zijlstraf8a696f2018-12-05 11:23:56 +01009481 if (sched_energy_enabled()) {
Morten Rasmussen2802bf32018-12-03 09:56:25 +00009482 struct root_domain *rd = env->dst_rq->rd;
9483
9484 if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
9485 goto out_balanced;
9486 }
9487
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009488 local = &sds.local_stat;
9489 busiest = &sds.busiest_stat;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009490
Peter Zijlstracc57aa82011-02-21 18:55:32 +01009491 /* There is no busy sibling group to pull tasks from */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009492 if (!sds.busiest)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009493 goto out_balanced;
9494
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009495 /* Misfit tasks should be dealt with regardless of the avg load */
9496 if (busiest->group_type == group_misfit_task)
9497 goto force_balance;
9498
9499 /* ASYM feature bypasses nice load balance check */
9500 if (busiest->group_type == group_asym_packing)
9501 goto force_balance;
Ken Chenb0432d82011-04-07 17:23:22 -07009502
Peter Zijlstra866ab432011-02-21 18:56:47 +01009503 /*
9504 * If the busiest group is imbalanced the below checks don't
Peter Zijlstra30ce5da2013-08-15 20:29:29 +02009505 * work because they assume all things are equal, which typically
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +02009506 * isn't true due to cpus_ptr constraints and the like.
Peter Zijlstra866ab432011-02-21 18:56:47 +01009507 */
Rik van Rielcaeb1782014-07-28 14:16:28 -04009508 if (busiest->group_type == group_imbalanced)
Peter Zijlstra866ab432011-02-21 18:56:47 +01009509 goto force_balance;
9510
Brendan Jackman583ffd92017-10-05 11:58:54 +01009511 /*
Zhihui Zhang9c58c792014-09-20 21:24:36 -04009512 * If the local group is busier than the selected busiest group
Peter Zijlstracc57aa82011-02-21 18:55:32 +01009513 * don't try and pull any tasks.
9514 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009515 if (local->group_type > busiest->group_type)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009516 goto out_balanced;
9517
Peter Zijlstracc57aa82011-02-21 18:55:32 +01009518 /*
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009519 * When groups are overloaded, use the avg_load to ensure fairness
9520 * between tasks.
Peter Zijlstracc57aa82011-02-21 18:55:32 +01009521 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009522 if (local->group_type == group_overloaded) {
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009523 /*
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009524 * If the local group is more loaded than the selected
9525 * busiest group don't try to pull any tasks.
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009526 */
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009527 if (local->avg_load >= busiest->avg_load)
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009528 goto out_balanced;
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009529
9530 /* XXX broken for overlapping NUMA groups */
9531 sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) /
9532 sds.total_capacity;
9533
Peter Zijlstrac186faf2011-02-21 18:52:53 +01009534 /*
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009535 * Don't pull any tasks if this group is already above the
9536 * domain average load.
9537 */
9538 if (local->avg_load >= sds.avg_load)
9539 goto out_balanced;
9540
9541 /*
9542 * If the busiest group is more loaded, use imbalance_pct to be
9543 * conservative.
Peter Zijlstrac186faf2011-02-21 18:52:53 +01009544 */
Joonsoo Kim56cf5152013-08-06 17:36:43 +09009545 if (100 * busiest->avg_load <=
9546 env->sd->imbalance_pct * local->avg_load)
Peter Zijlstrac186faf2011-02-21 18:52:53 +01009547 goto out_balanced;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07009548 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009549
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009550 /* Try to move all excess tasks to child's sibling domain */
9551 if (sds.prefer_sibling && local->group_type == group_has_spare &&
Vincent Guittot5e23e472019-10-18 15:26:32 +02009552 busiest->sum_nr_running > local->sum_nr_running + 1)
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009553 goto force_balance;
9554
Vincent Guittot2ab40922019-10-18 15:26:34 +02009555 if (busiest->group_type != group_overloaded) {
9556 if (env->idle == CPU_NOT_IDLE)
9557 /*
9558 * If the busiest group is not overloaded (and as a
9559 * result the local one too) but this CPU is already
9560 * busy, let another idle CPU try to pull task.
9561 */
9562 goto out_balanced;
9563
9564 if (busiest->group_weight > 1 &&
9565 local->idle_cpus <= (busiest->idle_cpus + 1))
9566 /*
9567 * If the busiest group is not overloaded
9568 * and there is no imbalance between this and busiest
9569 * group wrt idle CPUs, it is balanced. The imbalance
9570 * becomes significant if the diff is greater than 1
9571 * otherwise we might end up to just move the imbalance
9572 * on another group. Of course this applies only if
9573 * there is more than 1 CPU per group.
9574 */
9575 goto out_balanced;
9576
9577 if (busiest->sum_h_nr_running == 1)
9578 /*
9579 * busiest doesn't have any tasks waiting to run
9580 */
9581 goto out_balanced;
9582 }
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009583
Nikhil Raofab47622010-10-15 13:12:29 -07009584force_balance:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009585 /* Looks like there is an imbalance. Compute it */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009586 calculate_imbalance(env, &sds);
Vincent Guittotbb3485c2018-09-07 09:51:04 +02009587 return env->imbalance ? sds.busiest : NULL;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009588
9589out_balanced:
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009590 env->imbalance = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009591 return NULL;
9592}
9593
9594/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009595 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009596 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009597static struct rq *find_busiest_queue(struct lb_env *env,
Michael Wangb94031302012-07-12 16:10:13 +08009598 struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009599{
9600 struct rq *busiest = NULL, *rq;
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009601 unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
9602 unsigned int busiest_nr = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009603 int i;
9604
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02009605 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009606 unsigned long capacity, load, util;
9607 unsigned int nr_running;
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009608 enum fbq_type rt;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009609
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009610 rq = cpu_rq(i);
9611 rt = fbq_classify_rq(rq);
9612
9613 /*
9614 * We classify groups/runqueues into three groups:
9615 * - regular: there are !numa tasks
9616 * - remote: there are numa tasks that run on the 'wrong' node
9617 * - all: there is no distinction
9618 *
9619 * In order to avoid migrating ideally placed numa tasks,
9620 * ignore those when there's better options.
9621 *
9622 * If we ignore the actual busiest queue to migrate another
9623 * task, the next balance pass can still reduce the busiest
9624 * queue by moving tasks around inside the node.
9625 *
9626 * If we cannot move enough load due to this classification
9627 * the next pass will adjust the group classification and
9628 * allow migration of more tasks.
9629 *
9630 * Both cases only affect the total convergence complexity.
9631 */
9632 if (rt > env->fbq_type)
9633 continue;
9634
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009635 nr_running = rq->cfs.h_nr_running;
Vincent Guittotfc488ff2021-01-07 11:33:23 +01009636 if (!nr_running)
9637 continue;
9638
9639 capacity = capacity_of(i);
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10009640
Chris Redpath4ad38312018-07-04 11:17:48 +01009641 /*
9642 * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
9643 * eventually lead to active_balancing high->low capacity.
9644 * Higher per-CPU capacity is considered better than balancing
9645 * average load.
9646 */
9647 if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
Valentin Schneider4aed8aa2021-04-07 23:06:28 +01009648 !capacity_greater(capacity_of(env->dst_cpu), capacity) &&
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009649 nr_running == 1)
Chris Redpath4ad38312018-07-04 11:17:48 +01009650 continue;
9651
Ricardo Neri4006a722021-09-10 18:18:19 -07009652 /* Make sure we only pull tasks from a CPU of lower priority */
9653 if ((env->sd->flags & SD_ASYM_PACKING) &&
9654 sched_asym_prefer(i, env->dst_cpu) &&
9655 nr_running == 1)
9656 continue;
9657
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009658 switch (env->migration_type) {
9659 case migrate_load:
9660 /*
Vincent Guittotb0fb1eb2019-10-18 15:26:33 +02009661 * When comparing with load imbalance, use cpu_load()
9662 * which is not scaled with the CPU capacity.
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009663 */
Vincent Guittotb0fb1eb2019-10-18 15:26:33 +02009664 load = cpu_load(rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009665
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009666 if (nr_running == 1 && load > env->imbalance &&
9667 !check_cpu_capacity(rq, env->sd))
9668 break;
Vincent Guittotea678212015-02-27 16:54:11 +01009669
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009670 /*
9671 * For the load comparisons with the other CPUs,
Vincent Guittotb0fb1eb2019-10-18 15:26:33 +02009672 * consider the cpu_load() scaled with the CPU
9673 * capacity, so that the load can be moved away
9674 * from the CPU that is potentially running at a
9675 * lower capacity.
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009676 *
9677 * Thus we're looking for max(load_i / capacity_i),
9678 * crosswise multiplication to rid ourselves of the
9679 * division works out to:
9680 * load_i * capacity_j > load_j * capacity_i;
9681 * where j is our previous maximum.
9682 */
9683 if (load * busiest_capacity > busiest_load * capacity) {
9684 busiest_load = load;
9685 busiest_capacity = capacity;
9686 busiest = rq;
9687 }
9688 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009689
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009690 case migrate_util:
Dietmar Eggemann82762d22021-11-18 17:42:40 +01009691 util = cpu_util_cfs(i);
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009692
Vincent Guittotc32b4302020-03-12 17:54:29 +01009693 /*
9694 * Don't try to pull utilization from a CPU with one
9695 * running task. Whatever its utilization, we will fail
9696 * detach the task.
9697 */
9698 if (nr_running <= 1)
9699 continue;
9700
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009701 if (busiest_util < util) {
9702 busiest_util = util;
9703 busiest = rq;
9704 }
9705 break;
9706
9707 case migrate_task:
9708 if (busiest_nr < nr_running) {
9709 busiest_nr = nr_running;
9710 busiest = rq;
9711 }
9712 break;
9713
9714 case migrate_misfit:
9715 /*
9716 * For ASYM_CPUCAPACITY domains with misfit tasks we
9717 * simply seek the "biggest" misfit task.
9718 */
9719 if (rq->misfit_task_load > busiest_load) {
9720 busiest_load = rq->misfit_task_load;
9721 busiest = rq;
9722 }
9723
9724 break;
9725
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009726 }
9727 }
9728
9729 return busiest;
9730}
9731
9732/*
9733 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
9734 * so long as it is large enough.
9735 */
9736#define MAX_PINNED_INTERVAL 512
9737
Vincent Guittot46a745d2018-12-14 17:01:57 +01009738static inline bool
9739asym_active_balance(struct lb_env *env)
9740{
9741 /*
9742 * ASYM_PACKING needs to force migrate tasks from busy but
9743 * lower priority CPUs in order to pack all tasks in the
9744 * highest priority CPUs.
9745 */
9746 return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
9747 sched_asym_prefer(env->dst_cpu, env->src_cpu);
9748}
9749
9750static inline bool
Vincent Guittote9b97342021-01-07 11:33:25 +01009751imbalanced_active_balance(struct lb_env *env)
9752{
9753 struct sched_domain *sd = env->sd;
9754
9755 /*
9756 * The imbalanced case includes the case of pinned tasks preventing a fair
9757 * distribution of the load on the system but also the even distribution of the
9758 * threads on a system with spare capacity
9759 */
9760 if ((env->migration_type == migrate_task) &&
9761 (sd->nr_balance_failed > sd->cache_nice_tries+2))
9762 return 1;
9763
9764 return 0;
9765}
9766
9767static int need_active_balance(struct lb_env *env)
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01009768{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02009769 struct sched_domain *sd = env->sd;
9770
Vincent Guittot46a745d2018-12-14 17:01:57 +01009771 if (asym_active_balance(env))
9772 return 1;
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01009773
Vincent Guittote9b97342021-01-07 11:33:25 +01009774 if (imbalanced_active_balance(env))
9775 return 1;
9776
Vincent Guittot1aaf90a2015-02-27 16:54:14 +01009777 /*
9778 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
9779 * It's worth migrating the task if the src_cpu's capacity is reduced
9780 * because of other sched_class or IRQs if more capacity stays
9781 * available on dst_cpu.
9782 */
9783 if ((env->idle != CPU_NOT_IDLE) &&
9784 (env->src_rq->cfs.h_nr_running == 1)) {
9785 if ((check_cpu_capacity(env->src_rq, sd)) &&
9786 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
9787 return 1;
9788 }
9789
Vincent Guittot0b0695f2019-10-18 15:26:31 +02009790 if (env->migration_type == migrate_misfit)
Morten Rasmussencad68e52018-07-04 11:17:42 +01009791 return 1;
9792
Vincent Guittot46a745d2018-12-14 17:01:57 +01009793 return 0;
9794}
9795
Tejun Heo969c7922010-05-06 18:49:21 +02009796static int active_load_balance_cpu_stop(void *data);
9797
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009798static int should_we_balance(struct lb_env *env)
9799{
9800 struct sched_group *sg = env->sd->groups;
Peng Wang64297f22020-04-11 17:20:20 +08009801 int cpu;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009802
9803 /*
Peter Zijlstra024c9d22017-10-09 10:36:53 +02009804 * Ensure the balancing environment is consistent; can happen
9805 * when the softirq triggers 'during' hotplug.
9806 */
9807 if (!cpumask_test_cpu(env->dst_cpu, env->cpus))
9808 return 0;
9809
9810 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009811 * In the newly idle case, we will allow all the CPUs
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009812 * to do the newly idle load balance.
9813 */
9814 if (env->idle == CPU_NEWLY_IDLE)
9815 return 1;
9816
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009817 /* Try to find first idle CPU */
Peter Zijlstrae5c14b12017-05-01 10:47:02 +02009818 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
Peter Zijlstraaf218122017-05-01 08:51:05 +02009819 if (!idle_cpu(cpu))
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009820 continue;
9821
Peng Wang64297f22020-04-11 17:20:20 +08009822 /* Are we the first idle CPU? */
9823 return cpu == env->dst_cpu;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009824 }
9825
Peng Wang64297f22020-04-11 17:20:20 +08009826 /* Are we the first CPU of this group ? */
9827 return group_balance_cpu(sg) == env->dst_cpu;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009828}
9829
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009830/*
9831 * Check this_cpu to ensure it is balanced within domain. Attempt to move
9832 * tasks if there is an imbalance.
9833 */
9834static int load_balance(int this_cpu, struct rq *this_rq,
9835 struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009836 int *continue_balancing)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009837{
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309838 int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra62633222013-08-19 12:41:09 +02009839 struct sched_domain *sd_parent = sd->parent;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009840 struct sched_group *group;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009841 struct rq *busiest;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009842 struct rq_flags rf;
Christoph Lameter4ba29682014-08-26 19:12:21 -05009843 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009844
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009845 struct lb_env env = {
9846 .sd = sd,
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01009847 .dst_cpu = this_cpu,
9848 .dst_rq = this_rq,
Peter Zijlstraae4df9d2017-05-01 11:03:12 +02009849 .dst_grpmask = sched_group_span(sd->groups),
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009850 .idle = idle,
Peter Zijlstraeb953082012-04-17 13:38:40 +02009851 .loop_break = sched_nr_migrate_break,
Michael Wangb94031302012-07-12 16:10:13 +08009852 .cpus = cpus,
Peter Zijlstra0ec8aa02013-10-07 11:29:33 +01009853 .fbq_type = all,
Kirill Tkhai163122b2014-08-20 13:48:29 +04009854 .tasks = LIST_HEAD_INIT(env.tasks),
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009855 };
9856
Jeffrey Hugo65a44332017-06-07 13:18:57 -06009857 cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009858
Josh Poimboeufae928822016-06-17 12:43:24 -05009859 schedstat_inc(sd->lb_count[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009860
9861redo:
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009862 if (!should_we_balance(&env)) {
9863 *continue_balancing = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009864 goto out_balanced;
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009865 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009866
Joonsoo Kim23f0d202013-08-06 17:36:42 +09009867 group = find_busiest_group(&env);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009868 if (!group) {
Josh Poimboeufae928822016-06-17 12:43:24 -05009869 schedstat_inc(sd->lb_nobusyg[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009870 goto out_balanced;
9871 }
9872
Michael Wangb94031302012-07-12 16:10:13 +08009873 busiest = find_busiest_queue(&env, group);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009874 if (!busiest) {
Josh Poimboeufae928822016-06-17 12:43:24 -05009875 schedstat_inc(sd->lb_nobusyq[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009876 goto out_balanced;
9877 }
9878
Michael Wang78feefc2012-08-06 16:41:59 +08009879 BUG_ON(busiest == env.dst_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009880
Josh Poimboeufae928822016-06-17 12:43:24 -05009881 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009882
Vincent Guittot1aaf90a2015-02-27 16:54:14 +01009883 env.src_cpu = busiest->cpu;
9884 env.src_rq = busiest;
9885
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009886 ld_moved = 0;
Vincent Guittot8a41dfcd2021-01-07 11:33:24 +01009887 /* Clear this flag as soon as we find a pullable task */
9888 env.flags |= LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009889 if (busiest->nr_running > 1) {
9890 /*
9891 * Attempt to move tasks. If find_busiest_group has found
9892 * an imbalance but busiest->nr_running <= 1, the group is
9893 * still unbalanced. ld_moved simply stays zero, so it is
9894 * correctly treated as an imbalance.
9895 */
Peter Zijlstrac82513e2012-04-26 13:12:27 +02009896 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009897
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01009898more_balance:
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009899 rq_lock_irqsave(busiest, &rf);
Peter Zijlstra3bed5e22016-10-03 16:35:32 +02009900 update_rq_clock(busiest);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309901
9902 /*
9903 * cur_ld_moved - load moved in current iteration
9904 * ld_moved - cumulative load moved across iterations
9905 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04009906 cur_ld_moved = detach_tasks(&env);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009907
9908 /*
Kirill Tkhai163122b2014-08-20 13:48:29 +04009909 * We've detached some tasks from busiest_rq. Every
9910 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
9911 * unlock busiest->lock, and we are able to be sure
9912 * that nobody can manipulate the tasks in parallel.
9913 * See task_rq_lock() family for the details.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009914 */
Kirill Tkhai163122b2014-08-20 13:48:29 +04009915
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009916 rq_unlock(busiest, &rf);
Kirill Tkhai163122b2014-08-20 13:48:29 +04009917
9918 if (cur_ld_moved) {
9919 attach_tasks(&env);
9920 ld_moved += cur_ld_moved;
9921 }
9922
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +02009923 local_irq_restore(rf.flags);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309924
Joonsoo Kimf1cd0852013-04-23 17:27:37 +09009925 if (env.flags & LBF_NEED_BREAK) {
9926 env.flags &= ~LBF_NEED_BREAK;
9927 goto more_balance;
9928 }
9929
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309930 /*
9931 * Revisit (affine) tasks on src_cpu that couldn't be moved to
9932 * us and move them to an alternate dst_cpu in our sched_group
9933 * where they can run. The upper limit on how many times we
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009934 * iterate on same src_cpu is dependent on number of CPUs in our
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309935 * sched_group.
9936 *
9937 * This changes load balance semantics a bit on who can move
9938 * load to a given_cpu. In addition to the given_cpu itself
9939 * (or a ilb_cpu acting on its behalf where given_cpu is
9940 * nohz-idle), we now have balance_cpu in a position to move
9941 * load to given_cpu. In rare situations, this may cause
9942 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
9943 * _independently_ and at _same_ time to move some load to
Ingo Molnar3b037062021-03-18 13:38:50 +01009944 * given_cpu) causing excess load to be moved to given_cpu.
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309945 * This however should not happen so much in practice and
9946 * moreover subsequent load balance cycles should correct the
9947 * excess load moved.
9948 */
Peter Zijlstra62633222013-08-19 12:41:09 +02009949 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309950
Ingo Molnar97fb7a02018-03-03 14:01:12 +01009951 /* Prevent to re-select dst_cpu via env's CPUs */
Viresh Kumarc89d92e2019-02-12 14:57:01 +05309952 __cpumask_clear_cpu(env.dst_cpu, env.cpus);
Vladimir Davydov7aff2e32013-09-15 21:30:13 +04009953
Michael Wang78feefc2012-08-06 16:41:59 +08009954 env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309955 env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra62633222013-08-19 12:41:09 +02009956 env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309957 env.loop = 0;
9958 env.loop_break = sched_nr_migrate_break;
Joonsoo Kime02e60c2013-04-23 17:27:42 +09009959
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05309960 /*
9961 * Go back to "more_balance" rather than "redo" since we
9962 * need to continue with same src_cpu.
9963 */
9964 goto more_balance;
9965 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009966
Peter Zijlstra62633222013-08-19 12:41:09 +02009967 /*
9968 * We failed to reach balance because of affinity.
9969 */
9970 if (sd_parent) {
Nicolas Pitre63b2ca32014-05-26 18:19:37 -04009971 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
Peter Zijlstra62633222013-08-19 12:41:09 +02009972
Vincent Guittotafdeee02014-08-26 13:06:44 +02009973 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
Peter Zijlstra62633222013-08-19 12:41:09 +02009974 *group_imbalance = 1;
Peter Zijlstra62633222013-08-19 12:41:09 +02009975 }
9976
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009977 /* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01009978 if (unlikely(env.flags & LBF_ALL_PINNED)) {
Viresh Kumarc89d92e2019-02-12 14:57:01 +05309979 __cpumask_clear_cpu(cpu_of(busiest), cpus);
Jeffrey Hugo65a44332017-06-07 13:18:57 -06009980 /*
9981 * Attempting to continue load balancing at the current
9982 * sched_domain level only makes sense if there are
9983 * active CPUs remaining as possible busiest CPUs to
9984 * pull load from which are not contained within the
9985 * destination group that is receiving any migrated
9986 * load.
9987 */
9988 if (!cpumask_subset(cpus, env.dst_grpmask)) {
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05309989 env.loop = 0;
9990 env.loop_break = sched_nr_migrate_break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009991 goto redo;
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05309992 }
Vincent Guittotafdeee02014-08-26 13:06:44 +02009993 goto out_all_pinned;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01009994 }
9995 }
9996
9997 if (!ld_moved) {
Josh Poimboeufae928822016-06-17 12:43:24 -05009998 schedstat_inc(sd->lb_failed[idle]);
Venkatesh Pallipadi58b26c42010-09-10 18:19:17 -07009999 /*
10000 * Increment the failure counter only on periodic balance.
10001 * We do not want newidle balance, which can be very
10002 * frequent, pollute the failure counter causing
10003 * excessive cache_hot migrations and active balances.
10004 */
10005 if (idle != CPU_NEWLY_IDLE)
10006 sd->nr_balance_failed++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010007
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010008 if (need_active_balance(&env)) {
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010009 unsigned long flags;
10010
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -050010011 raw_spin_rq_lock_irqsave(busiest, flags);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010012
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010013 /*
10014 * Don't kick the active_load_balance_cpu_stop,
10015 * if the curr task on busiest CPU can't be
10016 * moved to this_cpu:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010017 */
Sebastian Andrzej Siewior3bd37062019-04-23 16:26:36 +020010018 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -050010019 raw_spin_rq_unlock_irqrestore(busiest, flags);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010020 goto out_one_pinned;
10021 }
10022
Vincent Guittot8a41dfcd2021-01-07 11:33:24 +010010023 /* Record that we found at least one task that could run on this_cpu */
10024 env.flags &= ~LBF_ALL_PINNED;
10025
Tejun Heo969c7922010-05-06 18:49:21 +020010026 /*
10027 * ->active_balance synchronizes accesses to
10028 * ->active_balance_work. Once set, it's cleared
10029 * only after active load balance is finished.
10030 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010031 if (!busiest->active_balance) {
10032 busiest->active_balance = 1;
10033 busiest->push_cpu = this_cpu;
10034 active_balance = 1;
10035 }
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -050010036 raw_spin_rq_unlock_irqrestore(busiest, flags);
Tejun Heo969c7922010-05-06 18:49:21 +020010037
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010038 if (active_balance) {
Tejun Heo969c7922010-05-06 18:49:21 +020010039 stop_one_cpu_nowait(cpu_of(busiest),
10040 active_load_balance_cpu_stop, busiest,
10041 &busiest->active_balance_work);
Peter Zijlstrabd939f42012-05-02 14:20:37 +020010042 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010043 }
Vincent Guittote9b97342021-01-07 11:33:25 +010010044 } else {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010045 sd->nr_balance_failed = 0;
Vincent Guittote9b97342021-01-07 11:33:25 +010010046 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010047
Vincent Guittote9b97342021-01-07 11:33:25 +010010048 if (likely(!active_balance) || need_active_balance(&env)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010049 /* We were unbalanced, so reset the balancing interval */
10050 sd->balance_interval = sd->min_interval;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010051 }
10052
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010053 goto out;
10054
10055out_balanced:
Vincent Guittotafdeee02014-08-26 13:06:44 +020010056 /*
10057 * We reach balance although we may have faced some affinity
Vincent Guittotf6cad8d2019-07-01 17:47:02 +020010058 * constraints. Clear the imbalance flag only if other tasks got
10059 * a chance to move and fix the imbalance.
Vincent Guittotafdeee02014-08-26 13:06:44 +020010060 */
Vincent Guittotf6cad8d2019-07-01 17:47:02 +020010061 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
Vincent Guittotafdeee02014-08-26 13:06:44 +020010062 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
10063
10064 if (*group_imbalance)
10065 *group_imbalance = 0;
10066 }
10067
10068out_all_pinned:
10069 /*
10070 * We reach balance because all tasks are pinned at this level so
10071 * we can't migrate them. Let the imbalance flag set so parent level
10072 * can try to migrate them.
10073 */
Josh Poimboeufae928822016-06-17 12:43:24 -050010074 schedstat_inc(sd->lb_balanced[idle]);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010075
10076 sd->nr_balance_failed = 0;
10077
10078out_one_pinned:
Venkatesh Pallipadi46e49b32011-02-14 14:38:50 -080010079 ld_moved = 0;
Valentin Schneider3f130a32018-09-26 16:12:07 +010010080
10081 /*
Peter Zijlstra5ba553e2019-05-29 20:36:42 +000010082 * newidle_balance() disregards balance intervals, so we could
10083 * repeatedly reach this code, which would lead to balance_interval
Ingo Molnar3b037062021-03-18 13:38:50 +010010084 * skyrocketing in a short amount of time. Skip the balance_interval
Peter Zijlstra5ba553e2019-05-29 20:36:42 +000010085 * increase logic to avoid that.
Valentin Schneider3f130a32018-09-26 16:12:07 +010010086 */
10087 if (env.idle == CPU_NEWLY_IDLE)
10088 goto out;
10089
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010090 /* tune up the balancing interval */
Valentin Schneider47b7aee2018-09-26 16:12:06 +010010091 if ((env.flags & LBF_ALL_PINNED &&
10092 sd->balance_interval < MAX_PINNED_INTERVAL) ||
10093 sd->balance_interval < sd->max_interval)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010094 sd->balance_interval *= 2;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010095out:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010096 return ld_moved;
10097}
10098
Jason Low52a08ef2014-05-08 17:49:22 -070010099static inline unsigned long
10100get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
10101{
10102 unsigned long interval = sd->balance_interval;
10103
10104 if (cpu_busy)
10105 interval *= sd->busy_factor;
10106
10107 /* scale ms to jiffies */
10108 interval = msecs_to_jiffies(interval);
Vincent Guittote4d32e42020-09-21 09:24:23 +020010109
10110 /*
10111 * Reduce likelihood of busy balancing at higher domains racing with
10112 * balancing at lower domains by preventing their balancing periods
10113 * from being multiples of each other.
10114 */
10115 if (cpu_busy)
10116 interval -= 1;
10117
Jason Low52a08ef2014-05-08 17:49:22 -070010118 interval = clamp(interval, 1UL, max_load_balance_interval);
10119
10120 return interval;
10121}
10122
10123static inline void
Leo Yan31851a92016-08-05 14:31:29 +080010124update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
Jason Low52a08ef2014-05-08 17:49:22 -070010125{
10126 unsigned long interval, next;
10127
Leo Yan31851a92016-08-05 14:31:29 +080010128 /* used by idle balance, so cpu_busy = 0 */
10129 interval = get_sd_balance_interval(sd, 0);
Jason Low52a08ef2014-05-08 17:49:22 -070010130 next = sd->last_balance + interval;
10131
10132 if (time_after(*next_balance, next))
10133 *next_balance = next;
10134}
10135
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010136/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010137 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
Tejun Heo969c7922010-05-06 18:49:21 +020010138 * running tasks off the busiest CPU onto idle CPUs. It requires at
10139 * least 1 task to be running on each physical CPU where possible, and
10140 * avoids physical / logical imbalances.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010141 */
Tejun Heo969c7922010-05-06 18:49:21 +020010142static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010143{
Tejun Heo969c7922010-05-06 18:49:21 +020010144 struct rq *busiest_rq = data;
10145 int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010146 int target_cpu = busiest_rq->push_cpu;
Tejun Heo969c7922010-05-06 18:49:21 +020010147 struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010148 struct sched_domain *sd;
Kirill Tkhaie5673f22014-08-20 13:48:01 +040010149 struct task_struct *p = NULL;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010150 struct rq_flags rf;
Tejun Heo969c7922010-05-06 18:49:21 +020010151
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010152 rq_lock_irq(busiest_rq, &rf);
Peter Zijlstraedd8e412017-09-07 17:03:51 +020010153 /*
10154 * Between queueing the stop-work and running it is a hole in which
10155 * CPUs can become inactive. We should not move tasks from or to
10156 * inactive CPUs.
10157 */
10158 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
10159 goto out_unlock;
Tejun Heo969c7922010-05-06 18:49:21 +020010160
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010161 /* Make sure the requested CPU hasn't gone down in the meantime: */
Tejun Heo969c7922010-05-06 18:49:21 +020010162 if (unlikely(busiest_cpu != smp_processor_id() ||
10163 !busiest_rq->active_balance))
10164 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010165
10166 /* Is there any task to move? */
10167 if (busiest_rq->nr_running <= 1)
Tejun Heo969c7922010-05-06 18:49:21 +020010168 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010169
10170 /*
10171 * This condition is "impossible", if it occurs
10172 * we need to fix it. Originally reported by
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010173 * Bjorn Helgaas on a 128-CPU setup.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010174 */
10175 BUG_ON(busiest_rq == target_rq);
10176
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010177 /* Search for an sd spanning us and the target CPU. */
Peter Zijlstradce840a2011-04-07 14:09:50 +020010178 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010179 for_each_domain(target_cpu, sd) {
Valentin Schneidere669ac82020-04-15 22:05:06 +010010180 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
10181 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010182 }
10183
10184 if (likely(sd)) {
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010185 struct lb_env env = {
10186 .sd = sd,
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +010010187 .dst_cpu = target_cpu,
10188 .dst_rq = target_rq,
10189 .src_cpu = busiest_rq->cpu,
10190 .src_rq = busiest_rq,
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010191 .idle = CPU_IDLE,
Valentin Schneider23fb06d2021-04-07 23:06:27 +010010192 .flags = LBF_ACTIVE_LB,
Peter Zijlstra8e45cb52012-02-22 12:47:19 +010010193 };
10194
Josh Poimboeufae928822016-06-17 12:43:24 -050010195 schedstat_inc(sd->alb_count);
Peter Zijlstra3bed5e22016-10-03 16:35:32 +020010196 update_rq_clock(busiest_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010197
Kirill Tkhaie5673f22014-08-20 13:48:01 +040010198 p = detach_one_task(&env);
Srikar Dronamrajud02c071182016-03-23 17:54:44 +053010199 if (p) {
Josh Poimboeufae928822016-06-17 12:43:24 -050010200 schedstat_inc(sd->alb_pushed);
Srikar Dronamrajud02c071182016-03-23 17:54:44 +053010201 /* Active balancing done, reset the failure counter. */
10202 sd->nr_balance_failed = 0;
10203 } else {
Josh Poimboeufae928822016-06-17 12:43:24 -050010204 schedstat_inc(sd->alb_failed);
Srikar Dronamrajud02c071182016-03-23 17:54:44 +053010205 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010206 }
Peter Zijlstradce840a2011-04-07 14:09:50 +020010207 rcu_read_unlock();
Tejun Heo969c7922010-05-06 18:49:21 +020010208out_unlock:
10209 busiest_rq->active_balance = 0;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010210 rq_unlock(busiest_rq, &rf);
Kirill Tkhaie5673f22014-08-20 13:48:01 +040010211
10212 if (p)
10213 attach_one_task(target_rq, p);
10214
10215 local_irq_enable();
10216
Tejun Heo969c7922010-05-06 18:49:21 +020010217 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010218}
10219
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010220static DEFINE_SPINLOCK(balancing);
10221
10222/*
10223 * Scale the max load_balance interval with the number of CPUs in the system.
10224 * This trades load-balance latency on larger machines for less cross talk.
10225 */
10226void update_max_interval(void)
10227{
10228 max_load_balance_interval = HZ*num_online_cpus()/10;
10229}
10230
Vincent Guittote60b56e2021-10-19 14:35:35 +020010231static inline bool update_newidle_cost(struct sched_domain *sd, u64 cost)
10232{
10233 if (cost > sd->max_newidle_lb_cost) {
10234 /*
10235 * Track max cost of a domain to make sure to not delay the
10236 * next wakeup on the CPU.
10237 */
10238 sd->max_newidle_lb_cost = cost;
10239 sd->last_decay_max_lb_cost = jiffies;
10240 } else if (time_after(jiffies, sd->last_decay_max_lb_cost + HZ)) {
10241 /*
10242 * Decay the newidle max times by ~1% per second to ensure that
10243 * it is not outdated and the current max cost is actually
10244 * shorter.
10245 */
10246 sd->max_newidle_lb_cost = (sd->max_newidle_lb_cost * 253) / 256;
10247 sd->last_decay_max_lb_cost = jiffies;
10248
10249 return true;
10250 }
10251
10252 return false;
10253}
10254
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010255/*
10256 * It checks each scheduling domain to see if it is due to be balanced,
10257 * and initiates a balancing operation if so.
10258 *
10259 * Balancing parameters are set up in init_sched_domains.
10260 */
10261static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
10262{
10263 int continue_balancing = 1;
10264 int cpu = rq->cpu;
Viresh Kumar323af6d2020-01-08 13:57:04 +053010265 int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010266 unsigned long interval;
10267 struct sched_domain *sd;
10268 /* Earliest time when we have to do rebalance again */
10269 unsigned long next_balance = jiffies + 60*HZ;
10270 int update_next_balance = 0;
10271 int need_serialize, need_decay = 0;
10272 u64 max_cost = 0;
10273
10274 rcu_read_lock();
10275 for_each_domain(cpu, sd) {
10276 /*
10277 * Decay the newidle max times here because this is a regular
Vincent Guittote60b56e2021-10-19 14:35:35 +020010278 * visit to all the domains.
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010279 */
Vincent Guittote60b56e2021-10-19 14:35:35 +020010280 need_decay = update_newidle_cost(sd, 0);
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010281 max_cost += sd->max_newidle_lb_cost;
10282
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010283 /*
10284 * Stop the load balance at this level. There is another
10285 * CPU in our sched group which is doing load balancing more
10286 * actively.
10287 */
10288 if (!continue_balancing) {
10289 if (need_decay)
10290 continue;
10291 break;
10292 }
10293
Viresh Kumar323af6d2020-01-08 13:57:04 +053010294 interval = get_sd_balance_interval(sd, busy);
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010295
10296 need_serialize = sd->flags & SD_SERIALIZE;
10297 if (need_serialize) {
10298 if (!spin_trylock(&balancing))
10299 goto out;
10300 }
10301
10302 if (time_after_eq(jiffies, sd->last_balance + interval)) {
10303 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
10304 /*
10305 * The LBF_DST_PINNED logic could have changed
10306 * env->dst_cpu, so we can't know our idle
10307 * state even if we migrated tasks. Update it.
10308 */
10309 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Viresh Kumar323af6d2020-01-08 13:57:04 +053010310 busy = idle != CPU_IDLE && !sched_idle_cpu(cpu);
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010311 }
10312 sd->last_balance = jiffies;
Viresh Kumar323af6d2020-01-08 13:57:04 +053010313 interval = get_sd_balance_interval(sd, busy);
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010314 }
10315 if (need_serialize)
10316 spin_unlock(&balancing);
10317out:
10318 if (time_after(next_balance, sd->last_balance + interval)) {
10319 next_balance = sd->last_balance + interval;
10320 update_next_balance = 1;
10321 }
10322 }
10323 if (need_decay) {
10324 /*
10325 * Ensure the rq-wide value also decays but keep it at a
10326 * reasonable floor to avoid funnies with rq->avg_idle.
10327 */
10328 rq->max_idle_balance_cost =
10329 max((u64)sysctl_sched_migration_cost, max_cost);
10330 }
10331 rcu_read_unlock();
10332
10333 /*
10334 * next_balance will be updated only when there is a need.
10335 * When the cpu is attached to null domain for ex, it will not be
10336 * updated.
10337 */
Vincent Guittot7a82e5f2021-02-24 14:30:04 +010010338 if (likely(update_next_balance))
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010339 rq->next_balance = next_balance;
10340
Peter Zijlstraaf3fe032018-02-20 10:58:39 +010010341}
10342
Mike Galbraithd987fc72011-12-05 10:01:47 +010010343static inline int on_null_domain(struct rq *rq)
10344{
10345 return unlikely(!rcu_dereference_sched(rq->sd));
10346}
10347
Frederic Weisbecker3451d022011-08-10 23:21:01 +020010348#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010349/*
10350 * idle load balancing details
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010351 * - When one of the busy CPUs notice that there may be an idle rebalancing
10352 * needed, they will kick the idle load balancer, which then does idle
10353 * load balancing for all the idle CPUs.
Nicholas Piggin9b019ac2019-04-12 14:26:13 +100010354 * - HK_FLAG_MISC CPUs are used for this task, because HK_FLAG_SCHED not set
10355 * anywhere yet.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010356 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010357
Daniel Lezcano3dd03372014-01-06 12:34:41 +010010358static inline int find_new_ilb(void)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010359{
Nicholas Piggin9b019ac2019-04-12 14:26:13 +100010360 int ilb;
Yuan ZhaoXiong031e3bd2021-06-06 21:11:55 +080010361 const struct cpumask *hk_mask;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010362
Yuan ZhaoXiong031e3bd2021-06-06 21:11:55 +080010363 hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
10364
10365 for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
Peter Zijlstra45da7a22020-08-18 10:48:17 +020010366
10367 if (ilb == smp_processor_id())
10368 continue;
10369
Nicholas Piggin9b019ac2019-04-12 14:26:13 +100010370 if (idle_cpu(ilb))
10371 return ilb;
10372 }
Suresh Siddha786d6dc2011-12-01 17:07:35 -080010373
10374 return nr_cpu_ids;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010375}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010376
10377/*
Nicholas Piggin9b019ac2019-04-12 14:26:13 +100010378 * Kick a CPU to do the nohz balancing, if it is time for it. We pick any
10379 * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one).
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010380 */
Peter Zijlstraa4064fb2017-12-21 10:42:50 +010010381static void kick_ilb(unsigned int flags)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010382{
10383 int ilb_cpu;
10384
Vincent Guittot3ea2f092020-06-09 14:37:48 +020010385 /*
10386 * Increase nohz.next_balance only when if full ilb is triggered but
10387 * not if we only update stats.
10388 */
10389 if (flags & NOHZ_BALANCE_KICK)
10390 nohz.next_balance = jiffies+1;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010391
Daniel Lezcano3dd03372014-01-06 12:34:41 +010010392 ilb_cpu = find_new_ilb();
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010393
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010394 if (ilb_cpu >= nr_cpu_ids)
10395 return;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010396
Peter Zijlstra19a1f5e2020-05-26 18:10:58 +020010397 /*
10398 * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
10399 * the first flag owns it; cleared by nohz_csd_func().
10400 */
Peter Zijlstraa4064fb2017-12-21 10:42:50 +010010401 flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu));
Peter Zijlstrab7031a02017-12-21 10:11:09 +010010402 if (flags & NOHZ_KICK_MASK)
Suresh Siddha1c792db2011-12-01 17:07:32 -080010403 return;
Peter Zijlstra45504872017-12-21 10:47:48 +010010404
Suresh Siddha1c792db2011-12-01 17:07:32 -080010405 /*
Peter Zijlstra (Intel)90b53632020-03-27 11:44:56 +010010406 * This way we generate an IPI on the target CPU which
Suresh Siddha1c792db2011-12-01 17:07:32 -080010407 * is idle. And the softirq performing nohz idle load balance
10408 * will be run before returning from the IPI.
10409 */
Peter Zijlstra (Intel)90b53632020-03-27 11:44:56 +010010410 smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd);
Peter Zijlstra45504872017-12-21 10:47:48 +010010411}
10412
10413/*
Valentin Schneider9f132742019-01-17 15:34:09 +000010414 * Current decision point for kicking the idle load balancer in the presence
10415 * of idle CPUs in the system.
Peter Zijlstra45504872017-12-21 10:47:48 +010010416 */
10417static void nohz_balancer_kick(struct rq *rq)
10418{
10419 unsigned long now = jiffies;
10420 struct sched_domain_shared *sds;
10421 struct sched_domain *sd;
10422 int nr_busy, i, cpu = rq->cpu;
Peter Zijlstraa4064fb2017-12-21 10:42:50 +010010423 unsigned int flags = 0;
Peter Zijlstra45504872017-12-21 10:47:48 +010010424
10425 if (unlikely(rq->idle_balance))
10426 return;
10427
10428 /*
10429 * We may be recently in ticked or tickless idle mode. At the first
10430 * busy tick after returning from idle, we will update the busy stats.
10431 */
Peter Zijlstra00357f52017-12-21 15:06:50 +010010432 nohz_balance_exit_idle(rq);
Peter Zijlstra45504872017-12-21 10:47:48 +010010433
10434 /*
10435 * None are in tickless mode and hence no need for NOHZ idle load
10436 * balancing.
10437 */
10438 if (likely(!atomic_read(&nohz.nr_cpus)))
10439 return;
10440
Vincent Guittotf643ea22018-02-13 11:31:17 +010010441 if (READ_ONCE(nohz.has_blocked) &&
10442 time_after(now, READ_ONCE(nohz.next_blocked)))
Peter Zijlstraa4064fb2017-12-21 10:42:50 +010010443 flags = NOHZ_STATS_KICK;
10444
Peter Zijlstra45504872017-12-21 10:47:48 +010010445 if (time_before(now, nohz.next_balance))
Peter Zijlstraa4064fb2017-12-21 10:42:50 +010010446 goto out;
Peter Zijlstra45504872017-12-21 10:47:48 +010010447
Valentin Schneidera0fe2cf2019-02-11 17:59:45 +000010448 if (rq->nr_running >= 2) {
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010449 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
Peter Zijlstra45504872017-12-21 10:47:48 +010010450 goto out;
10451 }
10452
10453 rcu_read_lock();
Peter Zijlstra45504872017-12-21 10:47:48 +010010454
10455 sd = rcu_dereference(rq->sd);
10456 if (sd) {
Valentin Schneidere25a7a92019-02-11 17:59:44 +000010457 /*
10458 * If there's a CFS task and the current CPU has reduced
10459 * capacity; kick the ILB to see if there's a better CPU to run
10460 * on.
10461 */
10462 if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) {
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010463 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
Peter Zijlstra45504872017-12-21 10:47:48 +010010464 goto unlock;
10465 }
10466 }
10467
Quentin Perret011b27b2018-12-03 09:56:19 +000010468 sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
Peter Zijlstra45504872017-12-21 10:47:48 +010010469 if (sd) {
Valentin Schneidere25a7a92019-02-11 17:59:44 +000010470 /*
10471 * When ASYM_PACKING; see if there's a more preferred CPU
10472 * currently idle; in which case, kick the ILB to move tasks
10473 * around.
10474 */
Valentin Schneider7edab782019-01-17 15:34:07 +000010475 for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
Peter Zijlstra45504872017-12-21 10:47:48 +010010476 if (sched_asym_prefer(i, cpu)) {
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010477 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
Peter Zijlstra45504872017-12-21 10:47:48 +010010478 goto unlock;
10479 }
10480 }
10481 }
Valentin Schneiderb9a7b882019-02-11 17:59:46 +000010482
10483 sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu));
10484 if (sd) {
10485 /*
10486 * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU
10487 * to run the misfit task on.
10488 */
10489 if (check_misfit_status(rq, sd)) {
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010490 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
Valentin Schneiderb9a7b882019-02-11 17:59:46 +000010491 goto unlock;
10492 }
10493
10494 /*
10495 * For asymmetric systems, we do not want to nicely balance
10496 * cache use, instead we want to embrace asymmetry and only
10497 * ensure tasks have enough CPU capacity.
10498 *
10499 * Skip the LLC logic because it's not relevant in that case.
10500 */
10501 goto unlock;
10502 }
10503
Peter Zijlstra45504872017-12-21 10:47:48 +010010504 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
10505 if (sds) {
10506 /*
10507 * If there is an imbalance between LLC domains (IOW we could
10508 * increase the overall cache use), we need some less-loaded LLC
10509 * domain to pull some load. Likewise, we may need to spread
10510 * load within the current LLC domain (e.g. packed SMT cores but
10511 * other CPUs are idle). We can't really know from here how busy
10512 * the others are - so just get a nohz balance going if it looks
10513 * like this LLC domain has tasks we could move.
10514 */
10515 nr_busy = atomic_read(&sds->nr_busy_cpus);
10516 if (nr_busy > 1) {
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010517 flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
Peter Zijlstra45504872017-12-21 10:47:48 +010010518 goto unlock;
10519 }
Peter Zijlstra45504872017-12-21 10:47:48 +010010520 }
10521unlock:
10522 rcu_read_unlock();
10523out:
Valentin Schneider7fd7a9e2021-08-23 12:17:00 +010010524 if (READ_ONCE(nohz.needs_update))
10525 flags |= NOHZ_NEXT_KICK;
10526
Peter Zijlstraa4064fb2017-12-21 10:42:50 +010010527 if (flags)
10528 kick_ilb(flags);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010529}
10530
Peter Zijlstra00357f52017-12-21 15:06:50 +010010531static void set_cpu_sd_state_busy(int cpu)
Suresh Siddha69e1e812011-12-01 17:07:33 -080010532{
10533 struct sched_domain *sd;
Peter Zijlstra00357f52017-12-21 15:06:50 +010010534
10535 rcu_read_lock();
10536 sd = rcu_dereference(per_cpu(sd_llc, cpu));
10537
10538 if (!sd || !sd->nohz_idle)
10539 goto unlock;
10540 sd->nohz_idle = 0;
10541
10542 atomic_inc(&sd->shared->nr_busy_cpus);
10543unlock:
10544 rcu_read_unlock();
10545}
10546
10547void nohz_balance_exit_idle(struct rq *rq)
10548{
10549 SCHED_WARN_ON(rq != this_rq());
10550
10551 if (likely(!rq->nohz_tick_stopped))
10552 return;
10553
10554 rq->nohz_tick_stopped = 0;
10555 cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask);
10556 atomic_dec(&nohz.nr_cpus);
10557
10558 set_cpu_sd_state_busy(rq->cpu);
10559}
10560
10561static void set_cpu_sd_state_idle(int cpu)
10562{
10563 struct sched_domain *sd;
Suresh Siddha69e1e812011-12-01 17:07:33 -080010564
Suresh Siddha69e1e812011-12-01 17:07:33 -080010565 rcu_read_lock();
Peter Zijlstra0e369d72016-05-09 10:38:01 +020010566 sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot25f55d92013-04-23 16:59:02 +020010567
10568 if (!sd || sd->nohz_idle)
10569 goto unlock;
10570 sd->nohz_idle = 1;
10571
Peter Zijlstra0e369d72016-05-09 10:38:01 +020010572 atomic_dec(&sd->shared->nr_busy_cpus);
Vincent Guittot25f55d92013-04-23 16:59:02 +020010573unlock:
Suresh Siddha69e1e812011-12-01 17:07:33 -080010574 rcu_read_unlock();
10575}
10576
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010577/*
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010578 * This routine will record that the CPU is going idle with tick stopped.
Suresh Siddha0b005cf2011-12-01 17:07:34 -080010579 * This info will be used in performing idle load balancing in the future.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010580 */
Alex Shic1cc0172012-09-10 15:10:58 +080010581void nohz_balance_enter_idle(int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010582{
Peter Zijlstra00357f52017-12-21 15:06:50 +010010583 struct rq *rq = cpu_rq(cpu);
10584
10585 SCHED_WARN_ON(cpu != smp_processor_id());
10586
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010587 /* If this CPU is going down, then nothing needs to be done: */
Suresh Siddha71325962012-01-19 18:28:57 -080010588 if (!cpu_active(cpu))
10589 return;
10590
Frederic Weisbecker387bc8b2017-06-19 04:12:02 +020010591 /* Spare idle load balancing on CPUs that don't want to be disturbed: */
Frederic Weisbeckerde201552017-10-27 04:42:35 +020010592 if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
Frederic Weisbecker387bc8b2017-06-19 04:12:02 +020010593 return;
10594
Vincent Guittotf643ea22018-02-13 11:31:17 +010010595 /*
10596 * Can be set safely without rq->lock held
10597 * If a clear happens, it will have evaluated last additions because
10598 * rq->lock is held during the check and the clear
10599 */
10600 rq->has_blocked_load = 1;
10601
10602 /*
10603 * The tick is still stopped but load could have been added in the
10604 * meantime. We set the nohz.has_blocked flag to trig a check of the
10605 * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear
10606 * of nohz.has_blocked can only happen after checking the new load
10607 */
Peter Zijlstra00357f52017-12-21 15:06:50 +010010608 if (rq->nohz_tick_stopped)
Vincent Guittotf643ea22018-02-13 11:31:17 +010010609 goto out;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010610
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010611 /* If we're a completely isolated CPU, we don't play: */
Peter Zijlstra00357f52017-12-21 15:06:50 +010010612 if (on_null_domain(rq))
Mike Galbraithd987fc72011-12-05 10:01:47 +010010613 return;
10614
Peter Zijlstra00357f52017-12-21 15:06:50 +010010615 rq->nohz_tick_stopped = 1;
10616
Alex Shic1cc0172012-09-10 15:10:58 +080010617 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
10618 atomic_inc(&nohz.nr_cpus);
Peter Zijlstra00357f52017-12-21 15:06:50 +010010619
Vincent Guittotf643ea22018-02-13 11:31:17 +010010620 /*
10621 * Ensures that if nohz_idle_balance() fails to observe our
10622 * @idle_cpus_mask store, it must observe the @has_blocked
Valentin Schneider7fd7a9e2021-08-23 12:17:00 +010010623 * and @needs_update stores.
Vincent Guittotf643ea22018-02-13 11:31:17 +010010624 */
10625 smp_mb__after_atomic();
10626
Peter Zijlstra00357f52017-12-21 15:06:50 +010010627 set_cpu_sd_state_idle(cpu);
Vincent Guittotf643ea22018-02-13 11:31:17 +010010628
Valentin Schneider7fd7a9e2021-08-23 12:17:00 +010010629 WRITE_ONCE(nohz.needs_update, 1);
Vincent Guittotf643ea22018-02-13 11:31:17 +010010630out:
10631 /*
10632 * Each time a cpu enter idle, we assume that it has blocked load and
10633 * enable the periodic update of the load of idle cpus
10634 */
10635 WRITE_ONCE(nohz.has_blocked, 1);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010636}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010637
YueHaibing3f5ad912021-03-29 22:40:29 +080010638static bool update_nohz_stats(struct rq *rq)
10639{
10640 unsigned int cpu = rq->cpu;
10641
10642 if (!rq->has_blocked_load)
10643 return false;
10644
10645 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
10646 return false;
10647
10648 if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
10649 return true;
10650
10651 update_blocked_averages(cpu);
10652
10653 return rq->has_blocked_load;
10654}
10655
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010656/*
Vincent Guittot31e77c92018-02-14 16:26:46 +010010657 * Internal function that runs load balance for all idle cpus. The load balance
10658 * can be a simple update of blocked load or a complete load balance with
10659 * tasks movement depending of flags.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010660 */
Vincent Guittotab2dde52021-02-24 14:30:02 +010010661static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
Vincent Guittot31e77c92018-02-14 16:26:46 +010010662 enum cpu_idle_type idle)
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010663{
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020010664 /* Earliest time when we have to do rebalance again */
Peter Zijlstraa4064fb2017-12-21 10:42:50 +010010665 unsigned long now = jiffies;
10666 unsigned long next_balance = now + 60*HZ;
Vincent Guittotf643ea22018-02-13 11:31:17 +010010667 bool has_blocked_load = false;
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020010668 int update_next_balance = 0;
Peter Zijlstrab7031a02017-12-21 10:11:09 +010010669 int this_cpu = this_rq->cpu;
Peter Zijlstrab7031a02017-12-21 10:11:09 +010010670 int balance_cpu;
10671 struct rq *rq;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010672
Peter Zijlstrab7031a02017-12-21 10:11:09 +010010673 SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010674
Vincent Guittotf643ea22018-02-13 11:31:17 +010010675 /*
10676 * We assume there will be no idle load after this update and clear
10677 * the has_blocked flag. If a cpu enters idle in the mean time, it will
Valentin Schneider7fd7a9e2021-08-23 12:17:00 +010010678 * set the has_blocked flag and trigger another update of idle load.
Vincent Guittotf643ea22018-02-13 11:31:17 +010010679 * Because a cpu that becomes idle, is added to idle_cpus_mask before
10680 * setting the flag, we are sure to not clear the state and not
10681 * check the load of an idle cpu.
Valentin Schneider7fd7a9e2021-08-23 12:17:00 +010010682 *
10683 * Same applies to idle_cpus_mask vs needs_update.
Vincent Guittotf643ea22018-02-13 11:31:17 +010010684 */
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010685 if (flags & NOHZ_STATS_KICK)
10686 WRITE_ONCE(nohz.has_blocked, 0);
Valentin Schneider7fd7a9e2021-08-23 12:17:00 +010010687 if (flags & NOHZ_NEXT_KICK)
10688 WRITE_ONCE(nohz.needs_update, 0);
Vincent Guittotf643ea22018-02-13 11:31:17 +010010689
10690 /*
10691 * Ensures that if we miss the CPU, we must see the has_blocked
10692 * store from nohz_balance_enter_idle().
10693 */
10694 smp_mb();
10695
Vincent Guittot7a82e5f2021-02-24 14:30:04 +010010696 /*
10697 * Start with the next CPU after this_cpu so we will end with this_cpu and let a
10698 * chance for other idle cpu to pull load.
10699 */
10700 for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
10701 if (!idle_cpu(balance_cpu))
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010702 continue;
10703
10704 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010705 * If this CPU gets work to do, stop the load balancing
10706 * work being done for other CPUs. Next load
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010707 * balancing owner will pick it up.
10708 */
Vincent Guittotf643ea22018-02-13 11:31:17 +010010709 if (need_resched()) {
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010710 if (flags & NOHZ_STATS_KICK)
10711 has_blocked_load = true;
Valentin Schneider7fd7a9e2021-08-23 12:17:00 +010010712 if (flags & NOHZ_NEXT_KICK)
10713 WRITE_ONCE(nohz.needs_update, 1);
Vincent Guittotf643ea22018-02-13 11:31:17 +010010714 goto abort;
10715 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010716
Vincent Guittot5ed4f1d2012-09-13 06:11:26 +020010717 rq = cpu_rq(balance_cpu);
10718
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010719 if (flags & NOHZ_STATS_KICK)
10720 has_blocked_load |= update_nohz_stats(rq);
Vincent Guittotf643ea22018-02-13 11:31:17 +010010721
Tim Chened61bbc2014-05-20 14:39:27 -070010722 /*
10723 * If time for next balance is due,
10724 * do the balance.
10725 */
10726 if (time_after_eq(jiffies, rq->next_balance)) {
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010727 struct rq_flags rf;
10728
Vincent Guittot31e77c92018-02-14 16:26:46 +010010729 rq_lock_irqsave(rq, &rf);
Tim Chened61bbc2014-05-20 14:39:27 -070010730 update_rq_clock(rq);
Vincent Guittot31e77c92018-02-14 16:26:46 +010010731 rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020010732
Peter Zijlstrab7031a02017-12-21 10:11:09 +010010733 if (flags & NOHZ_BALANCE_KICK)
10734 rebalance_domains(rq, CPU_IDLE);
Tim Chened61bbc2014-05-20 14:39:27 -070010735 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010736
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020010737 if (time_after(next_balance, rq->next_balance)) {
10738 next_balance = rq->next_balance;
10739 update_next_balance = 1;
10740 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010741 }
Vincent Guittotc5afb6a2015-08-03 11:55:50 +020010742
Vincent Guittot3ea2f092020-06-09 14:37:48 +020010743 /*
10744 * next_balance will be updated only when there is a need.
10745 * When the CPU is attached to null domain for ex, it will not be
10746 * updated.
10747 */
10748 if (likely(update_next_balance))
10749 nohz.next_balance = next_balance;
10750
Valentin Schneiderefd984c2021-08-23 12:16:59 +010010751 if (flags & NOHZ_STATS_KICK)
10752 WRITE_ONCE(nohz.next_blocked,
10753 now + msecs_to_jiffies(LOAD_AVG_PERIOD));
Vincent Guittotf643ea22018-02-13 11:31:17 +010010754
10755abort:
10756 /* There is still blocked load, enable periodic update */
10757 if (has_blocked_load)
10758 WRITE_ONCE(nohz.has_blocked, 1);
Vincent Guittot31e77c92018-02-14 16:26:46 +010010759}
10760
10761/*
10762 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
10763 * rebalancing for all the cpus for whom scheduler ticks are stopped.
10764 */
10765static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
10766{
Peter Zijlstra19a1f5e2020-05-26 18:10:58 +020010767 unsigned int flags = this_rq->nohz_idle_balance;
Vincent Guittot31e77c92018-02-14 16:26:46 +010010768
Peter Zijlstra19a1f5e2020-05-26 18:10:58 +020010769 if (!flags)
Vincent Guittot31e77c92018-02-14 16:26:46 +010010770 return false;
10771
Peter Zijlstra19a1f5e2020-05-26 18:10:58 +020010772 this_rq->nohz_idle_balance = 0;
Vincent Guittot31e77c92018-02-14 16:26:46 +010010773
Peter Zijlstra19a1f5e2020-05-26 18:10:58 +020010774 if (idle != CPU_IDLE)
Vincent Guittot31e77c92018-02-14 16:26:46 +010010775 return false;
10776
10777 _nohz_idle_balance(this_rq, flags, idle);
10778
Peter Zijlstrab7031a02017-12-21 10:11:09 +010010779 return true;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010780}
Vincent Guittot31e77c92018-02-14 16:26:46 +010010781
Vincent Guittotc6f88652021-02-24 14:30:06 +010010782/*
10783 * Check if we need to run the ILB for updating blocked load before entering
10784 * idle state.
10785 */
10786void nohz_run_idle_balance(int cpu)
10787{
10788 unsigned int flags;
10789
10790 flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
10791
10792 /*
10793 * Update the blocked load only if no SCHED_SOFTIRQ is about to happen
10794 * (ie NOHZ_STATS_KICK set) and will do the same.
10795 */
10796 if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
10797 _nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
10798}
10799
Vincent Guittot31e77c92018-02-14 16:26:46 +010010800static void nohz_newidle_balance(struct rq *this_rq)
10801{
10802 int this_cpu = this_rq->cpu;
10803
10804 /*
10805 * This CPU doesn't want to be disturbed by scheduler
10806 * housekeeping
10807 */
10808 if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED))
10809 return;
10810
10811 /* Will wake up very soon. No time for doing anything else*/
10812 if (this_rq->avg_idle < sysctl_sched_migration_cost)
10813 return;
10814
10815 /* Don't need to update blocked load of idle CPUs*/
10816 if (!READ_ONCE(nohz.has_blocked) ||
10817 time_before(jiffies, READ_ONCE(nohz.next_blocked)))
10818 return;
10819
Vincent Guittot31e77c92018-02-14 16:26:46 +010010820 /*
Vincent Guittotc6f88652021-02-24 14:30:06 +010010821 * Set the need to trigger ILB in order to update blocked load
10822 * before entering idle state.
Vincent Guittot31e77c92018-02-14 16:26:46 +010010823 */
Vincent Guittotc6f88652021-02-24 14:30:06 +010010824 atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
Vincent Guittot31e77c92018-02-14 16:26:46 +010010825}
10826
Peter Zijlstradd707242018-02-20 10:59:45 +010010827#else /* !CONFIG_NO_HZ_COMMON */
10828static inline void nohz_balancer_kick(struct rq *rq) { }
10829
Vincent Guittot31e77c92018-02-14 16:26:46 +010010830static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
Peter Zijlstrab7031a02017-12-21 10:11:09 +010010831{
10832 return false;
10833}
Vincent Guittot31e77c92018-02-14 16:26:46 +010010834
10835static inline void nohz_newidle_balance(struct rq *this_rq) { }
Peter Zijlstradd707242018-02-20 10:59:45 +010010836#endif /* CONFIG_NO_HZ_COMMON */
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010837
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010838/*
Barry Song5b78f2d2020-12-03 11:06:41 +130010839 * newidle_balance is called by schedule() if this_cpu is about to become
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010840 * idle. Attempts to pull tasks from other CPUs.
Peter Zijlstra7277a342019-11-08 14:15:55 +010010841 *
10842 * Returns:
10843 * < 0 - we released the lock and there are !fair tasks present
10844 * 0 - failed, no new tasks
10845 * > 0 - success, new (fair) tasks present
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010846 */
Chen Yud91cecc2020-04-21 18:50:34 +080010847static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010848{
10849 unsigned long next_balance = jiffies + HZ;
10850 int this_cpu = this_rq->cpu;
Vincent Guittot9e9af812021-10-19 14:35:33 +020010851 u64 t0, t1, curr_cost = 0;
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010852 struct sched_domain *sd;
10853 int pulled_task = 0;
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010854
Peter Zijlstra5ba553e2019-05-29 20:36:42 +000010855 update_misfit_status(NULL, this_rq);
Rik van Riele5e678e2021-04-22 13:02:36 -040010856
10857 /*
10858 * There is a task waiting to run. No need to search for one.
10859 * Return 0; the task will be enqueued when switching to idle.
10860 */
10861 if (this_rq->ttwu_pending)
10862 return 0;
10863
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010864 /*
10865 * We must set idle_stamp _before_ calling idle_balance(), such that we
10866 * measure the duration of idle_balance() as idle time.
10867 */
10868 this_rq->idle_stamp = rq_clock(this_rq);
10869
10870 /*
10871 * Do not pull tasks towards !active CPUs...
10872 */
10873 if (!cpu_active(this_cpu))
10874 return 0;
10875
10876 /*
10877 * This is OK, because current is on_cpu, which avoids it being picked
10878 * for load-balance and preemption/IRQs are still disabled avoiding
10879 * further scheduler activity on it and we're being very careful to
10880 * re-start the picking loop.
10881 */
10882 rq_unpin_lock(this_rq, rf);
10883
Vincent Guittot9d783c82021-10-19 14:35:34 +020010884 rcu_read_lock();
10885 sd = rcu_dereference_check_sched_domain(this_rq->sd);
Vincent Guittot31e77c92018-02-14 16:26:46 +010010886
Vincent Guittotc5b0a7e2021-10-19 14:35:36 +020010887 if (!READ_ONCE(this_rq->rd->overload) ||
Vincent Guittot9d783c82021-10-19 14:35:34 +020010888 (sd && this_rq->avg_idle < sd->max_newidle_lb_cost)) {
10889
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010890 if (sd)
10891 update_next_balance(sd, &next_balance);
10892 rcu_read_unlock();
10893
10894 goto out;
10895 }
Vincent Guittot9d783c82021-10-19 14:35:34 +020010896 rcu_read_unlock();
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010897
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -050010898 raw_spin_rq_unlock(this_rq);
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010899
Vincent Guittot9e9af812021-10-19 14:35:33 +020010900 t0 = sched_clock_cpu(this_cpu);
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010901 update_blocked_averages(this_cpu);
Vincent Guittot9e9af812021-10-19 14:35:33 +020010902
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010903 rcu_read_lock();
10904 for_each_domain(this_cpu, sd) {
10905 int continue_balancing = 1;
Vincent Guittot9e9af812021-10-19 14:35:33 +020010906 u64 domain_cost;
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010907
Vincent Guittot8ea91832021-10-19 14:35:37 +020010908 update_next_balance(sd, &next_balance);
10909
10910 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010911 break;
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010912
10913 if (sd->flags & SD_BALANCE_NEWIDLE) {
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010914
10915 pulled_task = load_balance(this_cpu, this_rq,
10916 sd, CPU_NEWLY_IDLE,
10917 &continue_balancing);
10918
Vincent Guittot9e9af812021-10-19 14:35:33 +020010919 t1 = sched_clock_cpu(this_cpu);
10920 domain_cost = t1 - t0;
Vincent Guittote60b56e2021-10-19 14:35:35 +020010921 update_newidle_cost(sd, domain_cost);
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010922
10923 curr_cost += domain_cost;
Vincent Guittot9e9af812021-10-19 14:35:33 +020010924 t0 = t1;
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010925 }
10926
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010927 /*
10928 * Stop searching for tasks to pull if there are
10929 * now runnable tasks on this rq.
10930 */
Rik van Riele5e678e2021-04-22 13:02:36 -040010931 if (pulled_task || this_rq->nr_running > 0 ||
10932 this_rq->ttwu_pending)
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010933 break;
10934 }
10935 rcu_read_unlock();
10936
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -050010937 raw_spin_rq_lock(this_rq);
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010938
10939 if (curr_cost > this_rq->max_idle_balance_cost)
10940 this_rq->max_idle_balance_cost = curr_cost;
10941
10942 /*
10943 * While browsing the domains, we released the rq lock, a task could
10944 * have been enqueued in the meantime. Since we're not going idle,
10945 * pretend we pulled a task.
10946 */
10947 if (this_rq->cfs.h_nr_running && !pulled_task)
10948 pulled_task = 1;
10949
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010950 /* Is there a task of a high priority class? */
10951 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
10952 pulled_task = -1;
10953
Vincent Guittot6553fc12021-02-24 14:30:05 +010010954out:
10955 /* Move the next balance forward */
10956 if (time_after(this_rq->next_balance, next_balance))
10957 this_rq->next_balance = next_balance;
10958
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010959 if (pulled_task)
10960 this_rq->idle_stamp = 0;
Vincent Guittot08265302021-02-24 14:30:01 +010010961 else
10962 nohz_newidle_balance(this_rq);
Peter Zijlstra47ea5412018-02-20 11:45:47 +010010963
10964 rq_repin_lock(this_rq, rf);
10965
10966 return pulled_task;
10967}
10968
10969/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010970 * run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -070010971 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010972 */
Emese Revfy0766f782016-06-20 20:42:34 +020010973static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010974{
Daniel Lezcano208cb162014-01-06 12:34:44 +010010975 struct rq *this_rq = this_rq();
Suresh Siddha6eb57e02011-10-03 15:09:01 -070010976 enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010977 CPU_IDLE : CPU_NOT_IDLE;
10978
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010979 /*
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010980 * If this CPU has a pending nohz_balance_kick, then do the
10981 * balancing on behalf of the other idle CPUs whose ticks are
Preeti U Murthyd4573c32015-03-26 18:32:44 +053010982 * stopped. Do nohz_idle_balance *before* rebalance_domains to
Ingo Molnar97fb7a02018-03-03 14:01:12 +010010983 * give the idle CPUs a chance to load balance. Else we may
Preeti U Murthyd4573c32015-03-26 18:32:44 +053010984 * load balance only within the local sched_domain hierarchy
10985 * and abort nohz_idle_balance altogether if we pull some load.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010986 */
Peter Zijlstrab7031a02017-12-21 10:11:09 +010010987 if (nohz_idle_balance(this_rq, idle))
10988 return;
10989
10990 /* normal load balance */
10991 update_blocked_averages(this_rq->cpu);
Preeti U Murthyd4573c32015-03-26 18:32:44 +053010992 rebalance_domains(this_rq, idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010993}
10994
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010995/*
10996 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010997 */
Daniel Lezcano7caff662014-01-06 12:34:38 +010010998void trigger_load_balance(struct rq *rq)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010010999{
Anna-Maria Behnsene0b257c2020-12-15 11:44:00 +010011000 /*
11001 * Don't need to rebalance while attached to NULL domain or
11002 * runqueue CPU is not active
11003 */
11004 if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq))))
Daniel Lezcanoc7260992014-01-06 12:34:45 +010011005 return;
11006
11007 if (time_after_eq(jiffies, rq->next_balance))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011008 raise_softirq(SCHED_SOFTIRQ);
Peter Zijlstra45504872017-12-21 10:47:48 +010011009
11010 nohz_balancer_kick(rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +010011011}
11012
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010011013static void rq_online_fair(struct rq *rq)
11014{
11015 update_sysctl();
Kirill Tkhai0e59bda2014-06-25 12:19:42 +040011016
11017 update_runtime_enabled(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010011018}
11019
11020static void rq_offline_fair(struct rq *rq)
11021{
11022 update_sysctl();
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -070011023
11024 /* Ensure any throttled groups are reachable by pick_next_task */
11025 unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010011026}
11027
Dhaval Giani55e12e52008-06-24 23:39:43 +053011028#endif /* CONFIG_SMP */
Peter Williamse1d14842007-10-24 18:23:51 +020011029
Vineeth Pillai8039e96f2020-11-17 18:19:38 -050011030#ifdef CONFIG_SCHED_CORE
11031static inline bool
11032__entity_slice_used(struct sched_entity *se, int min_nr_tasks)
11033{
11034 u64 slice = sched_slice(cfs_rq_of(se), se);
11035 u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime;
11036
11037 return (rtime * min_nr_tasks > slice);
11038}
11039
11040#define MIN_NR_TASKS_DURING_FORCEIDLE 2
11041static inline void task_tick_core(struct rq *rq, struct task_struct *curr)
11042{
11043 if (!sched_core_enabled(rq))
11044 return;
11045
11046 /*
11047 * If runqueue has only one task which used up its slice and
11048 * if the sibling is forced idle, then trigger schedule to
11049 * give forced idle task a chance.
11050 *
11051 * sched_slice() considers only this active rq and it gets the
11052 * whole slice. But during force idle, we have siblings acting
11053 * like a single runqueue and hence we need to consider runnable
Ingo Molnarcc00c192021-05-12 19:51:31 +020011054 * tasks on this CPU and the forced idle CPU. Ideally, we should
Vineeth Pillai8039e96f2020-11-17 18:19:38 -050011055 * go through the forced idle rq, but that would be a perf hit.
Ingo Molnarcc00c192021-05-12 19:51:31 +020011056 * We can assume that the forced idle CPU has at least
Vineeth Pillai8039e96f2020-11-17 18:19:38 -050011057 * MIN_NR_TASKS_DURING_FORCEIDLE - 1 tasks and use that to check
Ingo Molnarcc00c192021-05-12 19:51:31 +020011058 * if we need to give up the CPU.
Vineeth Pillai8039e96f2020-11-17 18:19:38 -050011059 */
Josh Don4feee7d2021-10-18 13:34:28 -070011060 if (rq->core->core_forceidle_count && rq->cfs.nr_running == 1 &&
Vineeth Pillai8039e96f2020-11-17 18:19:38 -050011061 __entity_slice_used(&curr->se, MIN_NR_TASKS_DURING_FORCEIDLE))
11062 resched_curr(rq);
11063}
Joel Fernandes (Google)c6047c22020-11-17 18:19:39 -050011064
11065/*
11066 * se_fi_update - Update the cfs_rq->min_vruntime_fi in a CFS hierarchy if needed.
11067 */
11068static void se_fi_update(struct sched_entity *se, unsigned int fi_seq, bool forceidle)
11069{
11070 for_each_sched_entity(se) {
11071 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11072
11073 if (forceidle) {
11074 if (cfs_rq->forceidle_seq == fi_seq)
11075 break;
11076 cfs_rq->forceidle_seq = fi_seq;
11077 }
11078
11079 cfs_rq->min_vruntime_fi = cfs_rq->min_vruntime;
11080 }
11081}
11082
11083void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi)
11084{
11085 struct sched_entity *se = &p->se;
11086
11087 if (p->sched_class != &fair_sched_class)
11088 return;
11089
11090 se_fi_update(se, rq->core->core_forceidle_seq, in_fi);
11091}
11092
11093bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
11094{
11095 struct rq *rq = task_rq(a);
11096 struct sched_entity *sea = &a->se;
11097 struct sched_entity *seb = &b->se;
11098 struct cfs_rq *cfs_rqa;
11099 struct cfs_rq *cfs_rqb;
11100 s64 delta;
11101
11102 SCHED_WARN_ON(task_rq(b)->core != rq->core);
11103
11104#ifdef CONFIG_FAIR_GROUP_SCHED
11105 /*
11106 * Find an se in the hierarchy for tasks a and b, such that the se's
11107 * are immediate siblings.
11108 */
11109 while (sea->cfs_rq->tg != seb->cfs_rq->tg) {
11110 int sea_depth = sea->depth;
11111 int seb_depth = seb->depth;
11112
11113 if (sea_depth >= seb_depth)
11114 sea = parent_entity(sea);
11115 if (sea_depth <= seb_depth)
11116 seb = parent_entity(seb);
11117 }
11118
11119 se_fi_update(sea, rq->core->core_forceidle_seq, in_fi);
11120 se_fi_update(seb, rq->core->core_forceidle_seq, in_fi);
11121
11122 cfs_rqa = sea->cfs_rq;
11123 cfs_rqb = seb->cfs_rq;
11124#else
11125 cfs_rqa = &task_rq(a)->cfs;
11126 cfs_rqb = &task_rq(b)->cfs;
11127#endif
11128
11129 /*
11130 * Find delta after normalizing se's vruntime with its cfs_rq's
11131 * min_vruntime_fi, which would have been updated in prior calls
11132 * to se_fi_update().
11133 */
11134 delta = (s64)(sea->vruntime - seb->vruntime) +
11135 (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);
11136
11137 return delta > 0;
11138}
Vineeth Pillai8039e96f2020-11-17 18:19:38 -050011139#else
11140static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {}
11141#endif
11142
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011143/*
Frederic Weisbeckerd84b3132018-02-21 05:17:27 +010011144 * scheduler tick hitting a task of our scheduling class.
11145 *
11146 * NOTE: This function can be called remotely by the tick offload that
11147 * goes along full dynticks. Therefore no local assumption can be made
11148 * and everything must be accessed through the @rq and @curr passed in
11149 * parameters.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011150 */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +010011151static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011152{
11153 struct cfs_rq *cfs_rq;
11154 struct sched_entity *se = &curr->se;
11155
11156 for_each_sched_entity(se) {
11157 cfs_rq = cfs_rq_of(se);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +010011158 entity_tick(cfs_rq, se, queued);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011159 }
Ben Segall18bf2802012-10-04 12:51:20 +020011160
Srikar Dronamrajub52da862015-10-02 07:48:25 +053011161 if (static_branch_unlikely(&sched_numa_balancing))
Peter Zijlstracbee9f82012-10-25 14:16:43 +020011162 task_tick_numa(rq, curr);
Morten Rasmussen3b1baa62018-07-04 11:17:40 +010011163
11164 update_misfit_status(curr, rq);
Morten Rasmussen2802bf32018-12-03 09:56:25 +000011165 update_overutilized_status(task_rq(curr));
Vineeth Pillai8039e96f2020-11-17 18:19:38 -050011166
11167 task_tick_core(rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011168}
11169
11170/*
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011171 * called on fork with the child task as argument from the parent's context
11172 * - child not yet on the tasklist
11173 * - preemption disabled
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011174 */
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011175static void task_fork_fair(struct task_struct *p)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011176{
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +090011177 struct cfs_rq *cfs_rq;
11178 struct sched_entity *se = &p->se, *curr;
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011179 struct rq *rq = this_rq();
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020011180 struct rq_flags rf;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011181
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020011182 rq_lock(rq, &rf);
Peter Zijlstra861d0342010-08-19 13:31:43 +020011183 update_rq_clock(rq);
11184
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +090011185 cfs_rq = task_cfs_rq(current);
11186 curr = cfs_rq->curr;
Peter Zijlstrae210bff2016-06-16 18:51:48 +020011187 if (curr) {
11188 update_curr(cfs_rq);
Mike Galbraithb5d9d732009-09-08 11:12:28 +020011189 se->vruntime = curr->vruntime;
Peter Zijlstrae210bff2016-06-16 18:51:48 +020011190 }
Peter Zijlstraaeb73b02007-10-15 17:00:05 +020011191 place_entity(cfs_rq, se, 1);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020011192
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011193 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko87fefa32007-10-15 17:00:08 +020011194 /*
Ingo Molnaredcb60a2007-10-15 17:00:08 +020011195 * Upon rescheduling, sched_class::put_prev_task() will place
11196 * 'current' within the tree based on its new key value.
11197 */
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020011198 swap(curr->vruntime, se->vruntime);
Kirill Tkhai88751252014-06-29 00:03:57 +040011199 resched_curr(rq);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +020011200 }
11201
Peter Zijlstra88ec22d2009-12-16 18:04:41 +010011202 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020011203 rq_unlock(rq, &rf);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011204}
11205
Steven Rostedtcb469842008-01-25 21:08:22 +010011206/*
11207 * Priority of the task has changed. Check to see if we preempt
11208 * the current task.
11209 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011210static void
11211prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
Steven Rostedtcb469842008-01-25 21:08:22 +010011212{
Kirill Tkhaida0c1e62014-08-20 13:47:32 +040011213 if (!task_on_rq_queued(p))
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011214 return;
11215
Frederic Weisbecker7c2e8bb2019-12-03 17:01:05 +010011216 if (rq->cfs.nr_running == 1)
11217 return;
11218
Steven Rostedtcb469842008-01-25 21:08:22 +010011219 /*
11220 * Reschedule if we are currently running on this runqueue and
11221 * our priority decreased, or if we are not currently running on
11222 * this runqueue and our priority is higher than the current's
11223 */
Hui Su65bcf072020-10-31 01:32:23 +080011224 if (task_current(rq, p)) {
Steven Rostedtcb469842008-01-25 21:08:22 +010011225 if (p->prio > oldprio)
Kirill Tkhai88751252014-06-29 00:03:57 +040011226 resched_curr(rq);
Steven Rostedtcb469842008-01-25 21:08:22 +010011227 } else
Peter Zijlstra15afe092008-09-20 23:38:02 +020011228 check_preempt_curr(rq, p, 0);
Steven Rostedtcb469842008-01-25 21:08:22 +010011229}
11230
Byungchul Parkdaa59402015-08-20 20:22:00 +090011231static inline bool vruntime_normalized(struct task_struct *p)
11232{
11233 struct sched_entity *se = &p->se;
11234
11235 /*
11236 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
11237 * the dequeue_entity(.flags=0) will already have normalized the
11238 * vruntime.
11239 */
11240 if (p->on_rq)
11241 return true;
11242
11243 /*
11244 * When !on_rq, vruntime of the task has usually NOT been normalized.
11245 * But there are some cases where it has already been normalized:
11246 *
11247 * - A forked child which is waiting for being woken up by
11248 * wake_up_new_task().
11249 * - A task which has been woken up by try_to_wake_up() and
11250 * waiting for actually being woken up by sched_ttwu_pending().
11251 */
Steve Muckled0cdb3c2018-08-31 15:42:17 -070011252 if (!se->sum_exec_runtime ||
Peter Zijlstra2f064a52021-06-11 10:28:17 +020011253 (READ_ONCE(p->__state) == TASK_WAKING && p->sched_remote_wakeup))
Byungchul Parkdaa59402015-08-20 20:22:00 +090011254 return true;
11255
11256 return false;
11257}
11258
Vincent Guittot09a43ac2016-11-08 10:53:45 +010011259#ifdef CONFIG_FAIR_GROUP_SCHED
11260/*
11261 * Propagate the changes of the sched_entity across the tg tree to make it
11262 * visible to the root
11263 */
11264static void propagate_entity_cfs_rq(struct sched_entity *se)
11265{
11266 struct cfs_rq *cfs_rq;
11267
Odin Ugedal0258bdf2021-05-01 16:19:50 +020011268 list_add_leaf_cfs_rq(cfs_rq_of(se));
11269
Vincent Guittot09a43ac2016-11-08 10:53:45 +010011270 /* Start to propagate at parent */
11271 se = se->parent;
11272
11273 for_each_sched_entity(se) {
11274 cfs_rq = cfs_rq_of(se);
11275
Odin Ugedal0258bdf2021-05-01 16:19:50 +020011276 if (!cfs_rq_throttled(cfs_rq)){
11277 update_load_avg(cfs_rq, se, UPDATE_TG);
11278 list_add_leaf_cfs_rq(cfs_rq);
11279 continue;
11280 }
Vincent Guittot09a43ac2016-11-08 10:53:45 +010011281
Odin Ugedal0258bdf2021-05-01 16:19:50 +020011282 if (list_add_leaf_cfs_rq(cfs_rq))
11283 break;
Vincent Guittot09a43ac2016-11-08 10:53:45 +010011284 }
11285}
11286#else
11287static void propagate_entity_cfs_rq(struct sched_entity *se) { }
11288#endif
11289
Vincent Guittotdf217912016-11-08 10:53:42 +010011290static void detach_entity_cfs_rq(struct sched_entity *se)
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011291{
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011292 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11293
Yuyang Du9d89c252015-07-15 08:04:37 +080011294 /* Catch up with the cfs_rq and remove our load when we leave */
Peter Zijlstra88c06162017-05-06 17:32:43 +020011295 update_load_avg(cfs_rq, se, 0);
Byungchul Parka05e8c52015-08-20 20:21:56 +090011296 detach_entity_load_avg(cfs_rq, se);
Xianting Tianfe749152020-09-24 09:47:55 +080011297 update_tg_load_avg(cfs_rq);
Vincent Guittot09a43ac2016-11-08 10:53:45 +010011298 propagate_entity_cfs_rq(se);
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011299}
11300
Vincent Guittotdf217912016-11-08 10:53:42 +010011301static void attach_entity_cfs_rq(struct sched_entity *se)
Steven Rostedtcb469842008-01-25 21:08:22 +010011302{
Byungchul Parkdaa59402015-08-20 20:22:00 +090011303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Byungchul Park7855a352015-08-10 18:02:55 +090011304
11305#ifdef CONFIG_FAIR_GROUP_SCHED
Michael wangeb7a59b2014-02-20 11:14:53 +080011306 /*
11307 * Since the real-depth could have been changed (only FAIR
11308 * class maintain depth value), reset depth properly.
11309 */
11310 se->depth = se->parent ? se->parent->depth + 1 : 0;
11311#endif
Byungchul Park7855a352015-08-10 18:02:55 +090011312
Vincent Guittotdf217912016-11-08 10:53:42 +010011313 /* Synchronize entity with its cfs_rq */
Peter Zijlstra88c06162017-05-06 17:32:43 +020011314 update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
Vincent Guittota4f9a0e2020-01-15 11:20:20 +010011315 attach_entity_load_avg(cfs_rq, se);
Xianting Tianfe749152020-09-24 09:47:55 +080011316 update_tg_load_avg(cfs_rq);
Vincent Guittot09a43ac2016-11-08 10:53:45 +010011317 propagate_entity_cfs_rq(se);
Vincent Guittotdf217912016-11-08 10:53:42 +010011318}
11319
11320static void detach_task_cfs_rq(struct task_struct *p)
11321{
11322 struct sched_entity *se = &p->se;
11323 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11324
11325 if (!vruntime_normalized(p)) {
11326 /*
11327 * Fix up our vruntime so that the current sleep doesn't
11328 * cause 'unlimited' sleep bonus.
11329 */
11330 place_entity(cfs_rq, se, 0);
11331 se->vruntime -= cfs_rq->min_vruntime;
11332 }
11333
11334 detach_entity_cfs_rq(se);
11335}
11336
11337static void attach_task_cfs_rq(struct task_struct *p)
11338{
11339 struct sched_entity *se = &p->se;
11340 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11341
11342 attach_entity_cfs_rq(se);
Byungchul Park6efdb102015-08-20 20:21:59 +090011343
Byungchul Parkdaa59402015-08-20 20:22:00 +090011344 if (!vruntime_normalized(p))
11345 se->vruntime += cfs_rq->min_vruntime;
11346}
Byungchul Park7855a352015-08-10 18:02:55 +090011347
Byungchul Parkdaa59402015-08-20 20:22:00 +090011348static void switched_from_fair(struct rq *rq, struct task_struct *p)
11349{
11350 detach_task_cfs_rq(p);
11351}
11352
11353static void switched_to_fair(struct rq *rq, struct task_struct *p)
11354{
11355 attach_task_cfs_rq(p);
11356
11357 if (task_on_rq_queued(p)) {
Byungchul Park7855a352015-08-10 18:02:55 +090011358 /*
Byungchul Parkdaa59402015-08-20 20:22:00 +090011359 * We were most likely switched from sched_rt, so
11360 * kick off the schedule if running, otherwise just see
11361 * if we can still preempt the current task.
Byungchul Park7855a352015-08-10 18:02:55 +090011362 */
Hui Su65bcf072020-10-31 01:32:23 +080011363 if (task_current(rq, p))
Byungchul Parkdaa59402015-08-20 20:22:00 +090011364 resched_curr(rq);
11365 else
11366 check_preempt_curr(rq, p, 0);
Byungchul Park7855a352015-08-10 18:02:55 +090011367 }
Steven Rostedtcb469842008-01-25 21:08:22 +010011368}
11369
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011370/* Account for a task changing its policy or group.
11371 *
11372 * This routine is mostly called to set cfs_rq->curr field when a task
11373 * migrates between groups/classes.
11374 */
Peter Zijlstraa0e813f2019-11-08 14:16:00 +010011375static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011376{
Peter Zijlstra03b7fad2019-05-29 20:36:41 +000011377 struct sched_entity *se = &p->se;
11378
11379#ifdef CONFIG_SMP
11380 if (task_on_rq_queued(p)) {
11381 /*
11382 * Move the next running task to the front of the list, so our
11383 * cfs_tasks list becomes MRU one.
11384 */
11385 list_move(&se->group_node, &rq->cfs_tasks);
11386 }
11387#endif
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011388
Paul Turnerec12cb72011-07-21 09:43:30 -070011389 for_each_sched_entity(se) {
11390 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11391
11392 set_next_entity(cfs_rq, se);
11393 /* ensure bandwidth has been allocated on our new cfs_rq */
11394 account_cfs_rq_runtime(cfs_rq, 0);
11395 }
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +020011396}
11397
Peter Zijlstra029632f2011-10-25 10:00:11 +020011398void init_cfs_rq(struct cfs_rq *cfs_rq)
11399{
Davidlohr Buesobfb06882017-09-08 16:14:55 -070011400 cfs_rq->tasks_timeline = RB_ROOT_CACHED;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011401 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
11402#ifndef CONFIG_64BIT
11403 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
11404#endif
Alex Shi141965c2013-06-26 13:05:39 +080011405#ifdef CONFIG_SMP
Peter Zijlstra2a2f5d4e2017-05-08 16:51:41 +020011406 raw_spin_lock_init(&cfs_rq->removed.lock);
Paul Turner9ee474f2012-10-04 13:18:30 +020011407#endif
Peter Zijlstra029632f2011-10-25 10:00:11 +020011408}
11409
Peter Zijlstra810b3812008-02-29 15:21:01 -050011410#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittotea86cb42016-06-17 13:38:55 +020011411static void task_set_group_fair(struct task_struct *p)
11412{
11413 struct sched_entity *se = &p->se;
11414
11415 set_task_rq(p, task_cpu(p));
11416 se->depth = se->parent ? se->parent->depth + 1 : 0;
11417}
11418
Peter Zijlstrabc54da22015-08-31 17:13:55 +020011419static void task_move_group_fair(struct task_struct *p)
Peter Zijlstra810b3812008-02-29 15:21:01 -050011420{
Byungchul Parkdaa59402015-08-20 20:22:00 +090011421 detach_task_cfs_rq(p);
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +020011422 set_task_rq(p, task_cpu(p));
Byungchul Park6efdb102015-08-20 20:21:59 +090011423
11424#ifdef CONFIG_SMP
11425 /* Tell se's cfs_rq has been changed -- migrated */
11426 p->se.avg.last_update_time = 0;
11427#endif
Byungchul Parkdaa59402015-08-20 20:22:00 +090011428 attach_task_cfs_rq(p);
Peter Zijlstra810b3812008-02-29 15:21:01 -050011429}
Peter Zijlstra029632f2011-10-25 10:00:11 +020011430
Vincent Guittotea86cb42016-06-17 13:38:55 +020011431static void task_change_group_fair(struct task_struct *p, int type)
11432{
11433 switch (type) {
11434 case TASK_SET_GROUP:
11435 task_set_group_fair(p);
11436 break;
11437
11438 case TASK_MOVE_GROUP:
11439 task_move_group_fair(p);
11440 break;
11441 }
11442}
11443
Peter Zijlstra029632f2011-10-25 10:00:11 +020011444void free_fair_sched_group(struct task_group *tg)
11445{
11446 int i;
11447
Peter Zijlstra029632f2011-10-25 10:00:11 +020011448 for_each_possible_cpu(i) {
11449 if (tg->cfs_rq)
11450 kfree(tg->cfs_rq[i]);
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011451 if (tg->se)
Peter Zijlstra029632f2011-10-25 10:00:11 +020011452 kfree(tg->se[i]);
11453 }
11454
11455 kfree(tg->cfs_rq);
11456 kfree(tg->se);
11457}
11458
11459int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11460{
Peter Zijlstra029632f2011-10-25 10:00:11 +020011461 struct sched_entity *se;
Peter Zijlstrab7fa30c2016-06-09 15:07:50 +020011462 struct cfs_rq *cfs_rq;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011463 int i;
11464
Kees Cook6396bb22018-06-12 14:03:40 -070011465 tg->cfs_rq = kcalloc(nr_cpu_ids, sizeof(cfs_rq), GFP_KERNEL);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011466 if (!tg->cfs_rq)
11467 goto err;
Kees Cook6396bb22018-06-12 14:03:40 -070011468 tg->se = kcalloc(nr_cpu_ids, sizeof(se), GFP_KERNEL);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011469 if (!tg->se)
11470 goto err;
11471
11472 tg->shares = NICE_0_LOAD;
11473
11474 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
11475
11476 for_each_possible_cpu(i) {
11477 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
11478 GFP_KERNEL, cpu_to_node(i));
11479 if (!cfs_rq)
11480 goto err;
11481
Yafang Shaoceeadb82021-09-05 14:35:41 +000011482 se = kzalloc_node(sizeof(struct sched_entity_stats),
Peter Zijlstra029632f2011-10-25 10:00:11 +020011483 GFP_KERNEL, cpu_to_node(i));
11484 if (!se)
11485 goto err_free_rq;
11486
11487 init_cfs_rq(cfs_rq);
11488 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
Yuyang Du540247f2015-07-15 08:04:39 +080011489 init_entity_runnable_average(se);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011490 }
11491
11492 return 1;
11493
11494err_free_rq:
11495 kfree(cfs_rq);
11496err:
11497 return 0;
11498}
11499
Peter Zijlstra8663e242016-06-22 14:58:02 +020011500void online_fair_sched_group(struct task_group *tg)
11501{
11502 struct sched_entity *se;
Phil Aulda46d14e2019-08-01 09:37:49 -040011503 struct rq_flags rf;
Peter Zijlstra8663e242016-06-22 14:58:02 +020011504 struct rq *rq;
11505 int i;
11506
11507 for_each_possible_cpu(i) {
11508 rq = cpu_rq(i);
11509 se = tg->se[i];
Phil Aulda46d14e2019-08-01 09:37:49 -040011510 rq_lock_irq(rq, &rf);
Peter Zijlstra4126bad2016-10-03 16:20:59 +020011511 update_rq_clock(rq);
Vincent Guittotd0326692016-11-08 10:53:47 +010011512 attach_entity_cfs_rq(se);
Peter Zijlstra55e16d32016-06-22 15:14:26 +020011513 sync_throttle(tg, i);
Phil Aulda46d14e2019-08-01 09:37:49 -040011514 rq_unlock_irq(rq, &rf);
Peter Zijlstra8663e242016-06-22 14:58:02 +020011515 }
11516}
11517
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011518void unregister_fair_sched_group(struct task_group *tg)
Peter Zijlstra029632f2011-10-25 10:00:11 +020011519{
Peter Zijlstra029632f2011-10-25 10:00:11 +020011520 unsigned long flags;
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011521 struct rq *rq;
11522 int cpu;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011523
Mathias Krauseb0277892021-11-03 20:06:13 +010011524 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
11525
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011526 for_each_possible_cpu(cpu) {
11527 if (tg->se[cpu])
11528 remove_entity_load_avg(tg->se[cpu]);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011529
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011530 /*
11531 * Only empty task groups can be destroyed; so we can speculatively
11532 * check on_list without danger of it being re-added.
11533 */
11534 if (!tg->cfs_rq[cpu]->on_list)
11535 continue;
11536
11537 rq = cpu_rq(cpu);
11538
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -050011539 raw_spin_rq_lock_irqsave(rq, flags);
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011540 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
Peter Zijlstra5cb9eaa2020-11-17 18:19:31 -050011541 raw_spin_rq_unlock_irqrestore(rq, flags);
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011542 }
Peter Zijlstra029632f2011-10-25 10:00:11 +020011543}
11544
11545void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
11546 struct sched_entity *se, int cpu,
11547 struct sched_entity *parent)
11548{
11549 struct rq *rq = cpu_rq(cpu);
11550
11551 cfs_rq->tg = tg;
11552 cfs_rq->rq = rq;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011553 init_cfs_rq_runtime(cfs_rq);
11554
11555 tg->cfs_rq[cpu] = cfs_rq;
11556 tg->se[cpu] = se;
11557
11558 /* se could be NULL for root_task_group */
11559 if (!se)
11560 return;
11561
Peter Zijlstrafed14d42012-02-11 06:05:00 +010011562 if (!parent) {
Peter Zijlstra029632f2011-10-25 10:00:11 +020011563 se->cfs_rq = &rq->cfs;
Peter Zijlstrafed14d42012-02-11 06:05:00 +010011564 se->depth = 0;
11565 } else {
Peter Zijlstra029632f2011-10-25 10:00:11 +020011566 se->cfs_rq = parent->my_q;
Peter Zijlstrafed14d42012-02-11 06:05:00 +010011567 se->depth = parent->depth + 1;
11568 }
Peter Zijlstra029632f2011-10-25 10:00:11 +020011569
11570 se->my_q = cfs_rq;
Paul Turner0ac9b1c2013-10-16 11:16:27 -070011571 /* guarantee group entities always have weight */
11572 update_load_set(&se->load, NICE_0_LOAD);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011573 se->parent = parent;
11574}
11575
11576static DEFINE_MUTEX(shares_mutex);
11577
Josh Don30400032021-07-29 19:00:18 -070011578static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
Peter Zijlstra029632f2011-10-25 10:00:11 +020011579{
11580 int i;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011581
Josh Don30400032021-07-29 19:00:18 -070011582 lockdep_assert_held(&shares_mutex);
11583
Peter Zijlstra029632f2011-10-25 10:00:11 +020011584 /*
11585 * We can't change the weight of the root cgroup.
11586 */
11587 if (!tg->se[0])
11588 return -EINVAL;
11589
11590 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
11591
Peter Zijlstra029632f2011-10-25 10:00:11 +020011592 if (tg->shares == shares)
Josh Don30400032021-07-29 19:00:18 -070011593 return 0;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011594
11595 tg->shares = shares;
11596 for_each_possible_cpu(i) {
11597 struct rq *rq = cpu_rq(i);
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020011598 struct sched_entity *se = tg->se[i];
11599 struct rq_flags rf;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011600
Peter Zijlstra029632f2011-10-25 10:00:11 +020011601 /* Propagate contribution to hierarchy */
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020011602 rq_lock_irqsave(rq, &rf);
Frederic Weisbecker71b1da42013-04-12 01:50:59 +020011603 update_rq_clock(rq);
Vincent Guittot89ee0482016-12-21 16:50:26 +010011604 for_each_sched_entity(se) {
Peter Zijlstra88c06162017-05-06 17:32:43 +020011605 update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
Peter Zijlstra1ea6c462017-05-06 15:59:54 +020011606 update_cfs_group(se);
Vincent Guittot89ee0482016-12-21 16:50:26 +010011607 }
Peter Zijlstra8a8c69c2016-10-04 16:04:35 +020011608 rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011609 }
11610
Josh Don30400032021-07-29 19:00:18 -070011611 return 0;
11612}
11613
11614int sched_group_set_shares(struct task_group *tg, unsigned long shares)
11615{
11616 int ret;
11617
11618 mutex_lock(&shares_mutex);
11619 if (tg_is_idle(tg))
11620 ret = -EINVAL;
11621 else
11622 ret = __sched_group_set_shares(tg, shares);
11623 mutex_unlock(&shares_mutex);
11624
11625 return ret;
11626}
11627
11628int sched_group_set_idle(struct task_group *tg, long idle)
11629{
11630 int i;
11631
11632 if (tg == &root_task_group)
11633 return -EINVAL;
11634
11635 if (idle < 0 || idle > 1)
11636 return -EINVAL;
11637
11638 mutex_lock(&shares_mutex);
11639
11640 if (tg->idle == idle) {
11641 mutex_unlock(&shares_mutex);
11642 return 0;
11643 }
11644
11645 tg->idle = idle;
11646
11647 for_each_possible_cpu(i) {
11648 struct rq *rq = cpu_rq(i);
11649 struct sched_entity *se = tg->se[i];
Josh Dona480add2021-08-19 18:04:01 -070011650 struct cfs_rq *parent_cfs_rq, *grp_cfs_rq = tg->cfs_rq[i];
Josh Don30400032021-07-29 19:00:18 -070011651 bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
11652 long idle_task_delta;
11653 struct rq_flags rf;
11654
11655 rq_lock_irqsave(rq, &rf);
11656
11657 grp_cfs_rq->idle = idle;
11658 if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
11659 goto next_cpu;
11660
Josh Dona480add2021-08-19 18:04:01 -070011661 if (se->on_rq) {
11662 parent_cfs_rq = cfs_rq_of(se);
11663 if (cfs_rq_is_idle(grp_cfs_rq))
11664 parent_cfs_rq->idle_nr_running++;
11665 else
11666 parent_cfs_rq->idle_nr_running--;
11667 }
11668
Josh Don30400032021-07-29 19:00:18 -070011669 idle_task_delta = grp_cfs_rq->h_nr_running -
11670 grp_cfs_rq->idle_h_nr_running;
11671 if (!cfs_rq_is_idle(grp_cfs_rq))
11672 idle_task_delta *= -1;
11673
11674 for_each_sched_entity(se) {
11675 struct cfs_rq *cfs_rq = cfs_rq_of(se);
11676
11677 if (!se->on_rq)
11678 break;
11679
11680 cfs_rq->idle_h_nr_running += idle_task_delta;
11681
11682 /* Already accounted at parent level and above. */
11683 if (cfs_rq_is_idle(cfs_rq))
11684 break;
11685 }
11686
11687next_cpu:
11688 rq_unlock_irqrestore(rq, &rf);
11689 }
11690
11691 /* Idle groups have minimum weight. */
11692 if (tg_is_idle(tg))
11693 __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
11694 else
11695 __sched_group_set_shares(tg, NICE_0_LOAD);
11696
Peter Zijlstra029632f2011-10-25 10:00:11 +020011697 mutex_unlock(&shares_mutex);
11698 return 0;
11699}
Josh Don30400032021-07-29 19:00:18 -070011700
Peter Zijlstra029632f2011-10-25 10:00:11 +020011701#else /* CONFIG_FAIR_GROUP_SCHED */
11702
11703void free_fair_sched_group(struct task_group *tg) { }
11704
11705int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
11706{
11707 return 1;
11708}
11709
Peter Zijlstra8663e242016-06-22 14:58:02 +020011710void online_fair_sched_group(struct task_group *tg) { }
11711
Peter Zijlstra6fe1f342016-01-21 22:24:16 +010011712void unregister_fair_sched_group(struct task_group *tg) { }
Peter Zijlstra029632f2011-10-25 10:00:11 +020011713
11714#endif /* CONFIG_FAIR_GROUP_SCHED */
11715
Peter Zijlstra810b3812008-02-29 15:21:01 -050011716
H Hartley Sweeten6d686f42010-01-13 20:21:52 -070011717static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
Peter Williams0d721ce2009-09-21 01:31:53 +000011718{
11719 struct sched_entity *se = &task->se;
Peter Williams0d721ce2009-09-21 01:31:53 +000011720 unsigned int rr_interval = 0;
11721
11722 /*
11723 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
11724 * idle runqueue:
11725 */
Peter Williams0d721ce2009-09-21 01:31:53 +000011726 if (rq->cfs.load.weight)
Zhu Yanhaia59f4e02013-01-08 12:56:52 +080011727 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams0d721ce2009-09-21 01:31:53 +000011728
11729 return rr_interval;
11730}
11731
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011732/*
11733 * All the scheduling class methods:
11734 */
Peter Zijlstra43c31ac2020-10-21 15:45:33 +020011735DEFINE_SCHED_CLASS(fair) = {
11736
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011737 .enqueue_task = enqueue_task_fair,
11738 .dequeue_task = dequeue_task_fair,
11739 .yield_task = yield_task_fair,
Mike Galbraithd95f4122011-02-01 09:50:51 -050011740 .yield_to_task = yield_to_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011741
Ingo Molnar2e09bf52007-10-15 17:00:05 +020011742 .check_preempt_curr = check_preempt_wakeup,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011743
Peter Zijlstra98c2f702019-11-08 14:15:58 +010011744 .pick_next_task = __pick_next_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011745 .put_prev_task = put_prev_task_fair,
Peter Zijlstra03b7fad2019-05-29 20:36:41 +000011746 .set_next_task = set_next_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011747
Peter Williams681f3e62007-10-24 18:23:51 +020011748#ifdef CONFIG_SMP
Peter Zijlstra6e2df052019-11-08 11:11:52 +010011749 .balance = balance_fair,
Peter Zijlstra21f56ffe2020-11-17 18:19:32 -050011750 .pick_task = pick_task_fair,
Li Zefan4ce72a22008-10-22 15:25:26 +080011751 .select_task_rq = select_task_rq_fair,
Paul Turner0a74bef2012-10-04 13:18:30 +020011752 .migrate_task_rq = migrate_task_rq_fair,
Alex Shi141965c2013-06-26 13:05:39 +080011753
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +010011754 .rq_online = rq_online_fair,
11755 .rq_offline = rq_offline_fair,
Peter Zijlstra88ec22d2009-12-16 18:04:41 +010011756
Yuyang Du12695572015-07-15 08:04:40 +080011757 .task_dead = task_dead_fair,
Peter Zijlstrac5b28032015-05-15 17:43:35 +020011758 .set_cpus_allowed = set_cpus_allowed_common,
Peter Williams681f3e62007-10-24 18:23:51 +020011759#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011760
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011761 .task_tick = task_tick_fair,
Peter Zijlstracd29fe62009-11-27 17:32:46 +010011762 .task_fork = task_fork_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +010011763
11764 .prio_changed = prio_changed_fair,
Peter Zijlstrada7a7352011-01-17 17:03:27 +010011765 .switched_from = switched_from_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +010011766 .switched_to = switched_to_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -050011767
Peter Williams0d721ce2009-09-21 01:31:53 +000011768 .get_rr_interval = get_rr_interval_fair,
11769
Stanislaw Gruszka6e998912014-11-12 16:58:44 +010011770 .update_curr = update_curr_fair,
11771
Peter Zijlstra810b3812008-02-29 15:21:01 -050011772#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittotea86cb42016-06-17 13:38:55 +020011773 .task_change_group = task_change_group_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -050011774#endif
Patrick Bellasi982d9cd2019-06-21 09:42:10 +010011775
11776#ifdef CONFIG_UCLAMP_TASK
11777 .uclamp_enabled = 1,
11778#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011779};
11780
11781#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra029632f2011-10-25 10:00:11 +020011782void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011783{
Vincent Guittot039ae8b2019-02-06 17:14:22 +010011784 struct cfs_rq *cfs_rq, *pos;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011785
Peter Zijlstra5973e5b2008-01-25 21:08:34 +010011786 rcu_read_lock();
Vincent Guittot039ae8b2019-02-06 17:14:22 +010011787 for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
Ingo Molnar5cef9ec2007-08-09 11:16:47 +020011788 print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra5973e5b2008-01-25 21:08:34 +010011789 rcu_read_unlock();
Ingo Molnarbf0f6f22007-07-09 18:51:58 +020011790}
Srikar Dronamraju397f2372015-06-25 22:51:43 +053011791
11792#ifdef CONFIG_NUMA_BALANCING
11793void show_numa_stats(struct task_struct *p, struct seq_file *m)
11794{
11795 int node;
11796 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
Jann Horncb361d82019-07-16 17:20:47 +020011797 struct numa_group *ng;
Srikar Dronamraju397f2372015-06-25 22:51:43 +053011798
Jann Horncb361d82019-07-16 17:20:47 +020011799 rcu_read_lock();
11800 ng = rcu_dereference(p->numa_group);
Srikar Dronamraju397f2372015-06-25 22:51:43 +053011801 for_each_online_node(node) {
11802 if (p->numa_faults) {
11803 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
11804 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
11805 }
Jann Horncb361d82019-07-16 17:20:47 +020011806 if (ng) {
11807 gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
11808 gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
Srikar Dronamraju397f2372015-06-25 22:51:43 +053011809 }
11810 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
11811 }
Jann Horncb361d82019-07-16 17:20:47 +020011812 rcu_read_unlock();
Srikar Dronamraju397f2372015-06-25 22:51:43 +053011813}
11814#endif /* CONFIG_NUMA_BALANCING */
11815#endif /* CONFIG_SCHED_DEBUG */
Peter Zijlstra029632f2011-10-25 10:00:11 +020011816
11817__init void init_sched_fair_class(void)
11818{
11819#ifdef CONFIG_SMP
11820 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
11821
Frederic Weisbecker3451d022011-08-10 23:21:01 +020011822#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam554ceca2012-03-07 14:44:26 -080011823 nohz.next_balance = jiffies;
Vincent Guittotf643ea22018-02-13 11:31:17 +010011824 nohz.next_blocked = jiffies;
Peter Zijlstra029632f2011-10-25 10:00:11 +020011825 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Peter Zijlstra029632f2011-10-25 10:00:11 +020011826#endif
11827#endif /* SMP */
11828
11829}
Qais Yousef3c93a0c2019-06-04 12:14:55 +010011830
11831/*
11832 * Helper functions to facilitate extracting info from tracepoints.
11833 */
11834
11835const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
11836{
11837#ifdef CONFIG_SMP
11838 return cfs_rq ? &cfs_rq->avg : NULL;
11839#else
11840 return NULL;
11841#endif
11842}
11843EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
11844
11845char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
11846{
11847 if (!cfs_rq) {
11848 if (str)
11849 strlcpy(str, "(null)", len);
11850 else
11851 return NULL;
11852 }
11853
11854 cfs_rq_tg_path(cfs_rq, str, len);
11855 return str;
11856}
11857EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
11858
11859int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
11860{
11861 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
11862}
11863EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
11864
11865const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
11866{
11867#ifdef CONFIG_SMP
11868 return rq ? &rq->avg_rt : NULL;
11869#else
11870 return NULL;
11871#endif
11872}
11873EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
11874
11875const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
11876{
11877#ifdef CONFIG_SMP
11878 return rq ? &rq->avg_dl : NULL;
11879#else
11880 return NULL;
11881#endif
11882}
11883EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
11884
11885const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
11886{
11887#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
11888 return rq ? &rq->avg_irq : NULL;
11889#else
11890 return NULL;
11891#endif
11892}
11893EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
11894
11895int sched_trace_rq_cpu(struct rq *rq)
11896{
11897 return rq ? cpu_of(rq) : -1;
11898}
11899EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
11900
Vincent Donnefort51cf18c2020-08-28 10:00:49 +010011901int sched_trace_rq_cpu_capacity(struct rq *rq)
11902{
11903 return rq ?
11904#ifdef CONFIG_SMP
11905 rq->cpu_capacity
11906#else
11907 SCHED_CAPACITY_SCALE
11908#endif
11909 : -1;
11910}
11911EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
11912
Qais Yousef3c93a0c2019-06-04 12:14:55 +010011913const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
11914{
11915#ifdef CONFIG_SMP
11916 return rd ? rd->span : NULL;
11917#else
11918 return NULL;
11919#endif
11920}
11921EXPORT_SYMBOL_GPL(sched_trace_rd_span);
Phil Auld9d246052020-06-29 15:23:03 -040011922
11923int sched_trace_rq_nr_running(struct rq *rq)
11924{
11925 return rq ? rq->nr_running : -1;
11926}
11927EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);