Blame - kernel/sched/fair.c - SHIFTPHONES/mainline/linux

blob: 1e5f58081762bd8a582b3fbd4a19b1f07b720279 [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
Peter Zijlstra	90eec10	2015-11-16 11:08:45 +0100	[diff] [blame]	20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Ingo Molnar	589ee62	2017-02-04 00:16:44 +0100	[diff] [blame]	23	#include <linux/sched/mm.h>
Ingo Molnar	105ab3d	2017-02-01 16:36:40 +0100	[diff] [blame]	24	#include <linux/sched/topology.h>
				25
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	26	#include <linux/latencytop.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	27	#include <linux/cpumask.h>
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	28	#include <linux/cpuidle.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	29	#include <linux/slab.h>
				30	#include <linux/profile.h>
				31	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	32	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	33	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	34	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	35
				36	#include <trace/events/sched.h>
				37
				38	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	39
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	40	/*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	41	* Targeted preemption latency for CPU-bound tasks:
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	42	*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	43	* NOTE: this latency value is not the same as the concept of
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	44	* 'timeslice length' - timeslices in CFS are of variable length
				45	* and have no persistent notion like in traditional, time-slice
				46	* based scheduling concepts.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	47	*
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	48	* (to see the precise effective timeslice length of your workload,
				49	* run vmstat and monitor the context-switches (cs) field)
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	50	*
				51	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	52	*/
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	53	unsigned int sysctl_sched_latency = 6000000ULL;
				54	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	55
				56	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	57	* The initial- and re-scaling of tunables is configurable
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	58	*
				59	* Options are:
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	60	*
				61	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				62	* SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
				63	* SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
				64	*
				65	* (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	66	*/
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	67	enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	68
				69	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	70	* Minimal preemption granularity for CPU-bound tasks:
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	71	*
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	72	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	73	*/
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	74	unsigned int sysctl_sched_min_granularity = 750000ULL;
				75	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	76
				77	/*
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	78	* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	79	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	80	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	81
				82	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	83	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	84	* parent will (try to) run first.
				85	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	86	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	87
				88	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	89	* SCHED_OTHER wake-up granularity.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	90	*
				91	* This option delays the preemption effects of decoupled workloads
				92	* and reduces their over-scheduling. Synchronous workloads will still
				93	* have immediate wakeup/sleep latencies.
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	94	*
				95	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	96	*/
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	97	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
				98	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	99
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	100	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	101
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	102	#ifdef CONFIG_SMP
				103	/*
				104	* For asym packing, by default the lower numbered cpu has higher priority.
				105	*/
				106	int __weak arch_asym_cpu_priority(int cpu)
				107	{
				108	return -cpu;
				109	}
				110	#endif
				111
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	112	#ifdef CONFIG_CFS_BANDWIDTH
				113	/*
				114	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				115	* each time a cfs_rq requests quota.
				116	*
				117	* Note: in the case that the slice exceeds the runtime remaining (either due
				118	* to consumption or the quota being specified to be smaller than the slice)
				119	* we will always only issue the remaining available time.
				120	*
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	121	* (default: 5 msec, units: microseconds)
				122	*/
				123	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	124	#endif
				125
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	126	/*
				127	* The margin used when comparing utilization with CPU capacity:
Morten Rasmussen	893c5d2	2016-10-14 14:41:12 +0100	[diff] [blame]	128	* util * margin < capacity * 1024
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	129	*
				130	* (default: ~20%)
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	131	*/
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	132	unsigned int capacity_margin = 1280;
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	133
Paul Gortmaker	8527632	2013-04-19 15:10:50 -0400	[diff] [blame]	134	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
				135	{
				136	lw->weight += inc;
				137	lw->inv_weight = 0;
				138	}
				139
				140	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
				141	{
				142	lw->weight -= dec;
				143	lw->inv_weight = 0;
				144	}
				145
				146	static inline void update_load_set(struct load_weight *lw, unsigned long w)
				147	{
				148	lw->weight = w;
				149	lw->inv_weight = 0;
				150	}
				151
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	152	/*
				153	* Increase the granularity value when there are more CPUs,
				154	* because with more CPUs the 'effective latency' as visible
				155	* to users decreases. But the relationship is not linear,
				156	* so pick a second-best guess by going with the log2 of the
				157	* number of CPUs.
				158	*
				159	* This idea comes from the SD scheduler of Con Kolivas:
				160	*/
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	161	static unsigned int get_update_sysctl_factor(void)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	162	{
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	163	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	164	unsigned int factor;
				165
				166	switch (sysctl_sched_tunable_scaling) {
				167	case SCHED_TUNABLESCALING_NONE:
				168	factor = 1;
				169	break;
				170	case SCHED_TUNABLESCALING_LINEAR:
				171	factor = cpus;
				172	break;
				173	case SCHED_TUNABLESCALING_LOG:
				174	default:
				175	factor = 1 + ilog2(cpus);
				176	break;
				177	}
				178
				179	return factor;
				180	}
				181
				182	static void update_sysctl(void)
				183	{
				184	unsigned int factor = get_update_sysctl_factor();
				185
				186	#define SET_SYSCTL(name) \
				187	(sysctl_##name = (factor) * normalized_sysctl_##name)
				188	SET_SYSCTL(sched_min_granularity);
				189	SET_SYSCTL(sched_latency);
				190	SET_SYSCTL(sched_wakeup_granularity);
				191	#undef SET_SYSCTL
				192	}
				193
				194	void sched_init_granularity(void)
				195	{
				196	update_sysctl();
				197	}
				198
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	199	#define WMULT_CONST (~0U)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	200	#define WMULT_SHIFT 32
				201
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	202	static void __update_inv_weight(struct load_weight *lw)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	203	{
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	204	unsigned long w;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	205
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	206	if (likely(lw->inv_weight))
				207	return;
				208
				209	w = scale_load_down(lw->weight);
				210
				211	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				212	lw->inv_weight = 1;
				213	else if (unlikely(!w))
				214	lw->inv_weight = WMULT_CONST;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	215	else
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	216	lw->inv_weight = WMULT_CONST / w;
				217	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	218
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	219	/*
				220	* delta_exec * weight / lw.weight
				221	* OR
				222	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
				223	*
Yuyang Du	1c3de5e	2016-03-30 07:07:51 +0800	[diff] [blame]	224	* Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	225	* we're guaranteed shift stays positive because inv_weight is guaranteed to
				226	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
				227	*
				228	* Or, weight <= lw.weight (because lw.weight is the runqueue weight), thus
				229	* weight/lw.weight <= 1, and therefore our shift will also be positive.
				230	*/
				231	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
				232	{
				233	u64 fact = scale_load_down(weight);
				234	int shift = WMULT_SHIFT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	235
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	236	__update_inv_weight(lw);
				237
				238	if (unlikely(fact >> 32)) {
				239	while (fact >> 32) {
				240	fact >>= 1;
				241	shift--;
				242	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	243	}
				244
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	245	/* hint to use a 32x32->64 mul */
				246	fact = (u64)(u32)fact * lw->inv_weight;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	247
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	248	while (fact >> 32) {
				249	fact >>= 1;
				250	shift--;
				251	}
				252
				253	return mul_u64_u32_shr(delta_exec, fact, shift);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	254	}
				255
				256
				257	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	258
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	259	/**************************************************************
				260	* CFS operations on generic schedulable entities:
				261	*/
				262
				263	#ifdef CONFIG_FAIR_GROUP_SCHED
				264
				265	/* cpu runqueue to which this cfs_rq is attached */
				266	static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
				267	{
				268	return cfs_rq->rq;
				269	}
				270
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	271	/* An entity is a task if it doesn't "own" a runqueue */
				272	#define entity_is_task(se) (!se->my_q)
				273
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	274	static inline struct task_struct *task_of(struct sched_entity *se)
				275	{
Peter Zijlstra	9148a3a	2016-09-20 22:34:51 +0200	[diff] [blame]	276	SCHED_WARN_ON(!entity_is_task(se));
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	277	return container_of(se, struct task_struct, se);
				278	}
				279
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	280	/* Walk up scheduling entities hierarchy */
				281	#define for_each_sched_entity(se) \
				282	for (; se; se = se->parent)
				283
				284	static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
				285	{
				286	return p->se.cfs_rq;
				287	}
				288
				289	/* runqueue on which this entity is (to be) queued */
				290	static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
				291	{
				292	return se->cfs_rq;
				293	}
				294
				295	/* runqueue "owned" by this group */
				296	static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
				297	{
				298	return grp->my_q;
				299	}
				300
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	301	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				302	{
				303	if (!cfs_rq->on_list) {
Vincent Guittot	9c2791f	2016-11-08 10:53:43 +0100	[diff] [blame]	304	struct rq *rq = rq_of(cfs_rq);
				305	int cpu = cpu_of(rq);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	306	/*
				307	* Ensure we either appear before our parent (if already
				308	* enqueued) or force our parent to appear after us when it is
Vincent Guittot	9c2791f	2016-11-08 10:53:43 +0100	[diff] [blame]	309	* enqueued. The fact that we always enqueue bottom-up
				310	* reduces this to two cases and a special case for the root
				311	* cfs_rq. Furthermore, it also means that we will always reset
				312	* tmp_alone_branch either when the branch is connected
				313	* to a tree or when we reach the beginning of the tree
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	314	*/
				315	if (cfs_rq->tg->parent &&
Vincent Guittot	9c2791f	2016-11-08 10:53:43 +0100	[diff] [blame]	316	cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
				317	/*
				318	* If parent is already on the list, we add the child
				319	* just before. Thanks to circular linked property of
				320	* the list, this means to put the child at the tail
				321	* of the list that starts by parent.
				322	*/
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	323	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
Vincent Guittot	9c2791f	2016-11-08 10:53:43 +0100	[diff] [blame]	324	&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
				325	/*
				326	* The branch is now connected to its tree so we can
				327	* reset tmp_alone_branch to the beginning of the
				328	* list.
				329	*/
				330	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
				331	} else if (!cfs_rq->tg->parent) {
				332	/*
				333	* cfs rq without parent should be put
				334	* at the tail of the list.
				335	*/
				336	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				337	&rq->leaf_cfs_rq_list);
				338	/*
				339	* We have reached the beginning of a tree so we can reset
				340	* tmp_alone_branch to the beginning of the list.
				341	*/
				342	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
				343	} else {
				344	/*
				345	* The parent has not already been added so we want to
				346	* make sure that it will be put after us.
				347	* tmp_alone_branch points to the beginning of the branch
				348	* where we will add parent.
				349	*/
				350	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				351	rq->tmp_alone_branch);
				352	/*
				353	* Update tmp_alone_branch to point to the new beginning
				354	* of the branch
				355	*/
				356	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	357	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	358
				359	cfs_rq->on_list = 1;
				360	}
				361	}
				362
				363	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				364	{
				365	if (cfs_rq->on_list) {
				366	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				367	cfs_rq->on_list = 0;
				368	}
				369	}
				370
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	371	/* Iterate through all leaf cfs_rqs on a runqueue */
				372	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				373	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
				374
				375	/* Do the two (enqueued) entities belong to the same group ? */
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	376	static inline struct cfs_rq *
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	377	is_same_group(struct sched_entity *se, struct sched_entity *pse)
				378	{
				379	if (se->cfs_rq == pse->cfs_rq)
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	380	return se->cfs_rq;
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	381
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	382	return NULL;
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	383	}
				384
				385	static inline struct sched_entity *parent_entity(struct sched_entity *se)
				386	{
				387	return se->parent;
				388	}
				389
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	390	static void
				391	find_matching_se(struct sched_entity **se, struct sched_entity **pse)
				392	{
				393	int se_depth, pse_depth;
				394
				395	/*
				396	* preemption test can be made between sibling entities who are in the
				397	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				398	* both tasks until we find their ancestors who are siblings of common
				399	* parent.
				400	*/
				401
				402	/* First walk up until both entities are at same depth */
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	403	se_depth = (*se)->depth;
				404	pse_depth = (*pse)->depth;
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	405
				406	while (se_depth > pse_depth) {
				407	se_depth--;
				408	*se = parent_entity(*se);
				409	}
				410
				411	while (pse_depth > se_depth) {
				412	pse_depth--;
				413	*pse = parent_entity(*pse);
				414	}
				415
				416	while (!is_same_group(*se, *pse)) {
				417	*se = parent_entity(*se);
				418	*pse = parent_entity(*pse);
				419	}
				420	}
				421
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	422	#else /* !CONFIG_FAIR_GROUP_SCHED */
				423
				424	static inline struct task_struct *task_of(struct sched_entity *se)
				425	{
				426	return container_of(se, struct task_struct, se);
				427	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	428
				429	static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
				430	{
				431	return container_of(cfs_rq, struct rq, cfs);
				432	}
				433
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	434	#define entity_is_task(se) 1
				435
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	436	#define for_each_sched_entity(se) \
				437	for (; se; se = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	438
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	439	static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	440	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	441	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	442	}
				443
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	444	static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
				445	{
				446	struct task_struct *p = task_of(se);
				447	struct rq *rq = task_rq(p);
				448
				449	return &rq->cfs;
				450	}
				451
				452	/* runqueue "owned" by this group */
				453	static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
				454	{
				455	return NULL;
				456	}
				457
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	458	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				459	{
				460	}
				461
				462	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				463	{
				464	}
				465
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	466	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				467	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
				468
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	469	static inline struct sched_entity *parent_entity(struct sched_entity *se)
				470	{
				471	return NULL;
				472	}
				473
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	474	static inline void
				475	find_matching_se(struct sched_entity **se, struct sched_entity **pse)
				476	{
				477	}
				478
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	479	#endif /* CONFIG_FAIR_GROUP_SCHED */
				480
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	481	static __always_inline
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	482	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	483
				484	/**************************************************************
				485	* Scheduling class tree data structure manipulation methods:
				486	*/
				487
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	488	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	489	{
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	490	s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	491	if (delta > 0)
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	492	max_vruntime = vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	493
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	494	return max_vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	495	}
				496
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	497	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	498	{
				499	s64 delta = (s64)(vruntime - min_vruntime);
				500	if (delta < 0)
				501	min_vruntime = vruntime;
				502
				503	return min_vruntime;
				504	}
				505
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	506	static inline int entity_before(struct sched_entity *a,
				507	struct sched_entity *b)
				508	{
				509	return (s64)(a->vruntime - b->vruntime) < 0;
				510	}
				511
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	512	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				513	{
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	514	struct sched_entity *curr = cfs_rq->curr;
				515
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	516	u64 vruntime = cfs_rq->min_vruntime;
				517
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	518	if (curr) {
				519	if (curr->on_rq)
				520	vruntime = curr->vruntime;
				521	else
				522	curr = NULL;
				523	}
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	524
				525	if (cfs_rq->rb_leftmost) {
				526	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				527	struct sched_entity,
				528	run_node);
				529
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	530	if (!curr)
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	531	vruntime = se->vruntime;
				532	else
				533	vruntime = min_vruntime(vruntime, se->vruntime);
				534	}
				535
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	536	/* ensure we never gain time by being placed backwards. */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	537	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	538	#ifndef CONFIG_64BIT
				539	smp_wmb();
				540	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				541	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	542	}
				543
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	544	/*
				545	* Enqueue an entity into the rb-tree:
				546	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	547	static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	548	{
				549	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				550	struct rb_node *parent = NULL;
				551	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	552	int leftmost = 1;
				553
				554	/*
				555	* Find the right place in the rbtree:
				556	*/
				557	while (*link) {
				558	parent = *link;
				559	entry = rb_entry(parent, struct sched_entity, run_node);
				560	/*
				561	* We don't care about collisions. Nodes with
				562	* the same key stay together.
				563	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	564	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	565	link = &parent->rb_left;
				566	} else {
				567	link = &parent->rb_right;
				568	leftmost = 0;
				569	}
				570	}
				571
				572	/*
				573	* Maintain a cache of leftmost tree entries (it is frequently
				574	* used):
				575	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	576	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	577	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	578
				579	rb_link_node(&se->run_node, parent, link);
				580	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	581	}
				582
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	583	static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	584	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	585	if (cfs_rq->rb_leftmost == &se->run_node) {
				586	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	587
				588	next_node = rb_next(&se->run_node);
				589	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	590	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	591
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	592	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	593	}
				594
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	595	struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	596	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	597	struct rb_node *left = cfs_rq->rb_leftmost;
				598
				599	if (!left)
				600	return NULL;
				601
				602	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	603	}
				604
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	605	static struct sched_entity *__pick_next_entity(struct sched_entity *se)
				606	{
				607	struct rb_node *next = rb_next(&se->run_node);
				608
				609	if (!next)
				610	return NULL;
				611
				612	return rb_entry(next, struct sched_entity, run_node);
				613	}
				614
				615	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	616	struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	617	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	618	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	619
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	620	if (!last)
				621	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	622
				623	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	624	}
				625
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	626	/**************************************************************
				627	* Scheduling class statistics methods:
				628	*/
				629
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	630	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	631	void __user *buffer, size_t *lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	632	loff_t *ppos)
				633	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	634	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	635	unsigned int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	636
				637	if (ret || !write)
				638	return ret;
				639
				640	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				641	sysctl_sched_min_granularity);
				642
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	643	#define WRT_SYSCTL(name) \
				644	(normalized_sysctl_##name = sysctl_##name / (factor))
				645	WRT_SYSCTL(sched_min_granularity);
				646	WRT_SYSCTL(sched_latency);
				647	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	648	#undef WRT_SYSCTL
				649
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	650	return 0;
				651	}
				652	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	653
				654	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	655	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	656	*/
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	657	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	658	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	659	if (unlikely(se->load.weight != NICE_0_LOAD))
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	660	delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	661
				662	return delta;
				663	}
				664
				665	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	666	* The idea is to set a period in which each task runs once.
				667	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	668	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	669	* this period because otherwise the slices get too small.
				670	*
				671	* p = (nr <= nl) ? l : l*nr/nl
				672	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	673	static u64 __sched_period(unsigned long nr_running)
				674	{
Boqun Feng	8e2b0bf	2015-07-02 22:25:52 +0800	[diff] [blame]	675	if (unlikely(nr_running > sched_nr_latency))
				676	return nr_running * sysctl_sched_min_granularity;
				677	else
				678	return sysctl_sched_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	679	}
				680
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	681	/*
				682	* We calculate the wall-time slice from the period by taking a part
				683	* proportional to the weight.
				684	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	685	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	686	*/
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	687	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	688	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	689	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	690
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	691	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	692	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	693	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	694
				695	cfs_rq = cfs_rq_of(se);
				696	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	697
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	698	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	699	lw = cfs_rq->load;
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	700
				701	update_load_add(&lw, se->load.weight);
				702	load = &lw;
				703	}
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	704	slice = __calc_delta(slice, se->load.weight, load);
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	705	}
				706	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	707	}
				708
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	709	/*
Andrei Epure	660cc00	2013-03-11 12:03:20 +0200	[diff] [blame]	710	* We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	711	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	712	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	713	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	714	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	715	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	716	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	717	}
				718
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	719	#ifdef CONFIG_SMP
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	720	static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	721	static unsigned long task_h_load(struct task_struct *p);
				722
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	723	/*
				724	* We choose a half-life close to 1 scheduling period.
Leo Yan	84fb5a1	2015-09-15 18:57:37 +0800	[diff] [blame]	725	* Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
				726	* dependent on this value.
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	727	*/
				728	#define LOAD_AVG_PERIOD 32
				729	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	730
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	731	/* Give new sched_entity start runnable values to heavy its load in infant time */
				732	void init_entity_runnable_average(struct sched_entity *se)
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	733	{
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	734	struct sched_avg *sa = &se->avg;
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	735
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	736	sa->last_update_time = 0;
				737	/*
				738	* sched_avg's period_contrib should be strictly less then 1024, so
				739	* we give it 1023 to make sure it is almost a period (1024us), and
				740	* will definitely be update (after enqueue).
				741	*/
				742	sa->period_contrib = 1023;
Vincent Guittot	b5a9b34	2016-10-19 14:45:23 +0200	[diff] [blame]	743	/*
				744	* Tasks are intialized with full load to be seen as heavy tasks until
				745	* they get a chance to stabilize to their real load level.
				746	* Group entities are intialized with zero load to reflect the fact that
				747	* nothing has been attached to the task group yet.
				748	*/
				749	if (entity_is_task(se))
				750	sa->load_avg = scale_load_down(se->load.weight);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	751	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
Yuyang Du	2b8c41d	2016-03-30 04:30:56 +0800	[diff] [blame]	752	/*
				753	* At this point, util_avg won't be used in select_task_rq_fair anyway
				754	*/
				755	sa->util_avg = 0;
				756	sa->util_sum = 0;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	757	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	758	}
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	759
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	760	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	761	static void attach_entity_cfs_rq(struct sched_entity *se);
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	762
Yuyang Du	2b8c41d	2016-03-30 04:30:56 +0800	[diff] [blame]	763	/*
				764	* With new tasks being created, their initial util_avgs are extrapolated
				765	* based on the cfs_rq's current util_avg:
				766	*
				767	* util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
				768	*
				769	* However, in many cases, the above util_avg does not give a desired
				770	* value. Moreover, the sum of the util_avgs may be divergent, such
				771	* as when the series is a harmonic series.
				772	*
				773	* To solve this problem, we also cap the util_avg of successive tasks to
				774	* only 1/2 of the left utilization budget:
				775	*
				776	* util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
				777	*
				778	* where n denotes the nth task.
				779	*
				780	* For example, a simplest series from the beginning would be like:
				781	*
				782	* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
				783	* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
				784	*
				785	* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
				786	* if util_avg > util_avg_cap.
				787	*/
				788	void post_init_entity_util_avg(struct sched_entity *se)
				789	{
				790	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				791	struct sched_avg *sa = &se->avg;
Yuyang Du	172895e	2016-04-05 12:12:27 +0800	[diff] [blame]	792	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
Yuyang Du	2b8c41d	2016-03-30 04:30:56 +0800	[diff] [blame]	793
				794	if (cap > 0) {
				795	if (cfs_rq->avg.util_avg != 0) {
				796	sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
				797	sa->util_avg /= (cfs_rq->avg.load_avg + 1);
				798
				799	if (sa->util_avg > cap)
				800	sa->util_avg = cap;
				801	} else {
				802	sa->util_avg = cap;
				803	}
				804	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
				805	}
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	806
				807	if (entity_is_task(se)) {
				808	struct task_struct *p = task_of(se);
				809	if (p->sched_class != &fair_sched_class) {
				810	/*
				811	* For !fair tasks do:
				812	*
				813	update_cfs_rq_load_avg(now, cfs_rq, false);
				814	attach_entity_load_avg(cfs_rq, se);
				815	switched_from_fair(rq, p);
				816	*
				817	* such that the next switched_to_fair() has the
				818	* expected state.
				819	*/
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	820	se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	821	return;
				822	}
				823	}
				824
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	825	attach_entity_cfs_rq(se);
Yuyang Du	2b8c41d	2016-03-30 04:30:56 +0800	[diff] [blame]	826	}
				827
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	828	#else /* !CONFIG_SMP */
/* !CONFIG_SMP: the entity load-tracking hooks below are empty stubs. */

/* Nothing to initialize without CONFIG_SMP. */
void init_entity_runnable_average(struct sched_entity *se)
{
}

/* Nothing to seed without CONFIG_SMP. */
void post_init_entity_util_avg(struct sched_entity *se)
{
}

/* Nothing to propagate without CONFIG_SMP. */
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	838	#endif /* CONFIG_SMP */
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	839
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	840	/*
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	841	* Update the current task's runtime statistics.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	842	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	843	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	844	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	845	struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	846	u64 now = rq_clock_task(rq_of(cfs_rq));
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	847	u64 delta_exec;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	848
				849	if (unlikely(!curr))
				850	return;
				851
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	852	delta_exec = now - curr->exec_start;
				853	if (unlikely((s64)delta_exec <= 0))
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	854	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	855
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	856	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	857
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	858	schedstat_set(curr->statistics.exec_max,
				859	max(delta_exec, curr->statistics.exec_max));
				860
				861	curr->sum_exec_runtime += delta_exec;
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	862	schedstat_add(cfs_rq->exec_clock, delta_exec);
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	863
				864	curr->vruntime += calc_delta_fair(delta_exec, curr);
				865	update_min_vruntime(cfs_rq);
				866
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	867	if (entity_is_task(curr)) {
				868	struct task_struct *curtask = task_of(curr);
				869
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	870	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	871	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	872	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	873	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	874
				875	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	876	}
				877
Stanislaw Gruszka	6e99891	2014-11-12 16:58:44 +0100	[diff] [blame]	878	static void update_curr_fair(struct rq *rq)
				879	{
				880	update_curr(cfs_rq_of(&rq->curr->se));
				881	}
				882
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	883	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	884	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	885	{
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	886	u64 wait_start, prev_wait_start;
				887
				888	if (!schedstat_enabled())
				889	return;
				890
				891	wait_start = rq_clock(rq_of(cfs_rq));
				892	prev_wait_start = schedstat_val(se->statistics.wait_start);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	893
				894	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	895	likely(wait_start > prev_wait_start))
				896	wait_start -= prev_wait_start;
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	897
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	898	schedstat_set(se->statistics.wait_start, wait_start);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	899	}
				900
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	901	static inline void
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	902	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
				903	{
				904	struct task_struct *p;
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	905	u64 delta;
				906
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	907	if (!schedstat_enabled())
				908	return;
				909
				910	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	911
				912	if (entity_is_task(se)) {
				913	p = task_of(se);
				914	if (task_on_rq_migrating(p)) {
				915	/*
				916	* Preserve migrating task's wait time so wait_start
				917	* time stamp can be adjusted to accumulate wait time
				918	* prior to migration.
				919	*/
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	920	schedstat_set(se->statistics.wait_start, delta);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	921	return;
				922	}
				923	trace_sched_stat_wait(p, delta);
				924	}
				925
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	926	schedstat_set(se->statistics.wait_max,
				927	max(schedstat_val(se->statistics.wait_max), delta));
				928	schedstat_inc(se->statistics.wait_count);
				929	schedstat_add(se->statistics.wait_sum, delta);
				930	schedstat_set(se->statistics.wait_start, 0);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	931	}
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	932
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	933	static inline void
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	934	update_stats_enqueue_sleeper(struct cfs_rq cfs_rq, struct sched_entity se)
				935	{
				936	struct task_struct *tsk = NULL;
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	937	u64 sleep_start, block_start;
				938
				939	if (!schedstat_enabled())
				940	return;
				941
				942	sleep_start = schedstat_val(se->statistics.sleep_start);
				943	block_start = schedstat_val(se->statistics.block_start);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	944
				945	if (entity_is_task(se))
				946	tsk = task_of(se);
				947
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	948	if (sleep_start) {
				949	u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	950
				951	if ((s64)delta < 0)
				952	delta = 0;
				953
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	954	if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
				955	schedstat_set(se->statistics.sleep_max, delta);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	956
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	957	schedstat_set(se->statistics.sleep_start, 0);
				958	schedstat_add(se->statistics.sum_sleep_runtime, delta);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	959
				960	if (tsk) {
				961	account_scheduler_latency(tsk, delta >> 10, 1);
				962	trace_sched_stat_sleep(tsk, delta);
				963	}
				964	}
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	965	if (block_start) {
				966	u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	967
				968	if ((s64)delta < 0)
				969	delta = 0;
				970
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	971	if (unlikely(delta > schedstat_val(se->statistics.block_max)))
				972	schedstat_set(se->statistics.block_max, delta);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	973
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	974	schedstat_set(se->statistics.block_start, 0);
				975	schedstat_add(se->statistics.sum_sleep_runtime, delta);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	976
				977	if (tsk) {
				978	if (tsk->in_iowait) {
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	979	schedstat_add(se->statistics.iowait_sum, delta);
				980	schedstat_inc(se->statistics.iowait_count);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	981	trace_sched_stat_iowait(tsk, delta);
				982	}
				983
				984	trace_sched_stat_blocked(tsk, delta);
				985
				986	/*
				987	* Blocking time is in units of nanosecs, so shift by
				988	* 20 to get a milliseconds-range estimation of the
				989	* amount of time that the task spent sleeping:
				990	*/
				991	if (unlikely(prof_on == SLEEP_PROFILING)) {
				992	profile_hits(SLEEP_PROFILING,
				993	(void *)get_wchan(tsk),
				994	delta >> 20);
				995	}
				996	account_scheduler_latency(tsk, delta >> 10, 0);
				997	}
				998	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	999	}
				1000
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1001	/*
				1002	* Task is being enqueued - update stats:
				1003	*/
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1004	static inline void
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	1005	update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1006	{
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	1007	if (!schedstat_enabled())
				1008	return;
				1009
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1010	/*
				1011	* Are we enqueueing a waiting task? (for current tasks
				1012	* a dequeue/enqueue event is a NOP)
				1013	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1014	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	1015	update_stats_wait_start(cfs_rq, se);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	1016
				1017	if (flags & ENQUEUE_WAKEUP)
				1018	update_stats_enqueue_sleeper(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1019	}
				1020
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1021	static inline void
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1022	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1023	{
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	1024
				1025	if (!schedstat_enabled())
				1026	return;
				1027
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1028	/*
				1029	* Mark the end of the wait period if dequeueing a
				1030	* waiting task:
				1031	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1032	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	1033	update_stats_wait_end(cfs_rq, se);
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1034
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	1035	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
				1036	struct task_struct *tsk = task_of(se);
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1037
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	1038	if (tsk->state & TASK_INTERRUPTIBLE)
				1039	schedstat_set(se->statistics.sleep_start,
				1040	rq_clock(rq_of(cfs_rq)));
				1041	if (tsk->state & TASK_UNINTERRUPTIBLE)
				1042	schedstat_set(se->statistics.block_start,
				1043	rq_clock(rq_of(cfs_rq)));
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1044	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1045	}
				1046
				1047	/*
				1048	* We are picking a new current task - update its stats:
				1049	*/
				1050	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	1051	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1052	{
				1053	/*
				1054	* We are starting a new run period:
				1055	*/
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1056	se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1057	}
				1058
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1059	/**************************************************
				1060	* Scheduling class queueing methods:
				1061	*/
				1062
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1063	#ifdef CONFIG_NUMA_BALANCING
				1064	/*
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1065	* Approximate time to scan a full NUMA task in ms. The task scan period is
				1066	* calculated based on the tasks virtual memory size and
				1067	* numa_balancing_scan_size.
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1068	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1069	unsigned int sysctl_numa_balancing_scan_period_min = 1000;
				1070	unsigned int sysctl_numa_balancing_scan_period_max = 60000;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1071
				1072	/* Portion of address space to scan in MB */
				1073	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1074
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	1075	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				1076	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				1077
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1078	static unsigned int task_nr_scan_windows(struct task_struct *p)
				1079	{
				1080	unsigned long rss = 0;
				1081	unsigned long nr_scan_pages;
				1082
				1083	/*
				1084	* Calculations based on RSS as non-present and empty pages are skipped
				1085	* by the PTE scanner and NUMA hinting faults should be trapped based
				1086	* on resident pages
				1087	*/
				1088	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
				1089	rss = get_mm_rss(p->mm);
				1090	if (!rss)
				1091	rss = nr_scan_pages;
				1092
				1093	rss = round_up(rss, nr_scan_pages);
				1094	return rss / nr_scan_pages;
				1095	}
				1096
				1097	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
				1098	#define MAX_SCAN_WINDOW 2560
				1099
				1100	static unsigned int task_scan_min(struct task_struct *p)
				1101	{
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	1102	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1103	unsigned int scan, floor;
				1104	unsigned int windows = 1;
				1105
Kirill Tkhai	6419265	2014-10-16 14:39:37 +0400	[diff] [blame]	1106	if (scan_size < MAX_SCAN_WINDOW)
				1107	windows = MAX_SCAN_WINDOW / scan_size;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1108	floor = 1000 / windows;
				1109
				1110	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
				1111	return max_t(unsigned int, floor, scan);
				1112	}
				1113
				1114	static unsigned int task_scan_max(struct task_struct *p)
				1115	{
				1116	unsigned int smin = task_scan_min(p);
				1117	unsigned int smax;
				1118
				1119	/* Watch for min being lower than max due to floor calculations */
				1120	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
				1121	return max(smin, smax);
				1122	}
				1123
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	1124	static void account_numa_enqueue(struct rq rq, struct task_struct p)
				1125	{
				1126	rq->nr_numa_running += (p->numa_preferred_nid != -1);
				1127	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
				1128	}
				1129
				1130	static void account_numa_dequeue(struct rq rq, struct task_struct p)
				1131	{
				1132	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
				1133	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
				1134	}
				1135
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1136	struct numa_group {
				1137	atomic_t refcount;
				1138
				1139	spinlock_t lock; /* nr_tasks, tasks */
				1140	int nr_tasks;
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	1141	pid_t gid;
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1142	int active_nodes;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1143
				1144	struct rcu_head rcu;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	1145	unsigned long total_faults;
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1146	unsigned long max_faults_cpu;
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1147	/*
				1148	* Faults_cpu is used to decide whether memory should move
				1149	* towards the CPU. As a consequence, these stats are weighted
				1150	* more by CPU use than by memory faults.
				1151	*/
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	1152	unsigned long *faults_cpu;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	1153	unsigned long faults[0];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1154	};
				1155
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	1156	/* Shared or private faults. */
				1157	#define NR_NUMA_HINT_FAULT_TYPES 2
				1158
				1159	/* Memory and CPU locality */
				1160	#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
				1161
				1162	/* Averaged statistics, and temporary buffers. */
				1163	#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
				1164
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	1165	pid_t task_numa_group_id(struct task_struct *p)
				1166	{
				1167	return p->numa_group ? p->numa_group->gid : 0;
				1168	}
				1169
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1170	/*
				1171	* The averaged statistics, shared & private, memory & cpu,
				1172	* occupy the first half of the array. The second half of the
				1173	* array is for current counters, which are averaged into the
				1174	* first set by task_numa_placement.
				1175	*/
				1176	static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1177	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1178	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1179	}
				1180
				1181	static inline unsigned long task_faults(struct task_struct *p, int nid)
				1182	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1183	if (!p->numa_faults)
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1184	return 0;
				1185
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1186	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				1187	p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1188	}
				1189
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1190	static inline unsigned long group_faults(struct task_struct *p, int nid)
				1191	{
				1192	if (!p->numa_group)
				1193	return 0;
				1194
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1195	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				1196	p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1197	}
				1198
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1199	static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
				1200	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1201	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
				1202	group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1203	}
				1204
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1205	/*
				1206	* A node triggering more than 1/3 as many NUMA faults as the maximum is
				1207	* considered part of a numa group's pseudo-interleaving set. Migrations
				1208	* between these nodes are slowed down, to allow things to settle down.
				1209	*/
				1210	#define ACTIVE_NODE_FRACTION 3
				1211
				1212	static bool numa_is_active_node(int nid, struct numa_group *ng)
				1213	{
				1214	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
				1215	}
				1216
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1217	/* Handle placement on systems where not all nodes are directly connected. */
				1218	static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
				1219	int maxdist, bool task)
				1220	{
				1221	unsigned long score = 0;
				1222	int node;
				1223
				1224	/*
				1225	* All nodes are directly connected, and the same distance
				1226	* from each other. No need for fancy placement algorithms.
				1227	*/
				1228	if (sched_numa_topology_type == NUMA_DIRECT)
				1229	return 0;
				1230
				1231	/*
				1232	* This code is called for each node, introducing N^2 complexity,
				1233	* which should be ok given the number of nodes rarely exceeds 8.
				1234	*/
				1235	for_each_online_node(node) {
				1236	unsigned long faults;
				1237	int dist = node_distance(nid, node);
				1238
				1239	/*
				1240	* The furthest away nodes in the system are not interesting
				1241	* for placement; nid was already counted.
				1242	*/
				1243	if (dist == sched_max_numa_distance \|\| node == nid)
				1244	continue;
				1245
				1246	/*
				1247	* On systems with a backplane NUMA topology, compare groups
				1248	* of nodes, and move tasks towards the group with the most
				1249	* memory accesses. When comparing two nodes at distance
				1250	* "hoplimit", only nodes closer by than "hoplimit" are part
				1251	* of each group. Skip other nodes.
				1252	*/
				1253	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1254	dist > maxdist)
				1255	continue;
				1256
				1257	/* Add up the faults from nearby nodes. */
				1258	if (task)
				1259	faults = task_faults(p, node);
				1260	else
				1261	faults = group_faults(p, node);
				1262
				1263	/*
				1264	* On systems with a glueless mesh NUMA topology, there are
				1265	* no fixed "groups of nodes". Instead, nodes that are not
				1266	* directly connected bounce traffic through intermediate
				1267	* nodes; a numa_group can occupy any set of nodes.
				1268	* The further away a node is, the less the faults count.
				1269	* This seems to result in good task placement.
				1270	*/
				1271	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				1272	faults *= (sched_max_numa_distance - dist);
				1273	faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
				1274	}
				1275
				1276	score += faults;
				1277	}
				1278
				1279	return score;
				1280	}
				1281
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1282	/*
				1283	* These return the fraction of accesses done by a particular task, or
				1284	* task group, on a particular numa node. The group weight is given a
				1285	* larger multiplier, in order to group tasks together that are almost
				1286	* evenly spread out between numa nodes.
				1287	*/
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1288	static inline unsigned long task_weight(struct task_struct *p, int nid,
				1289	int dist)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1290	{
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1291	unsigned long faults, total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1292
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1293	if (!p->numa_faults)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1294	return 0;
				1295
				1296	total_faults = p->total_numa_faults;
				1297
				1298	if (!total_faults)
				1299	return 0;
				1300
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1301	faults = task_faults(p, nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1302	faults += score_nearby_nodes(p, nid, dist, true);
				1303
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1304	return 1000 * faults / total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1305	}
				1306
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1307	static inline unsigned long group_weight(struct task_struct *p, int nid,
				1308	int dist)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1309	{
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1310	unsigned long faults, total_faults;
				1311
				1312	if (!p->numa_group)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1313	return 0;
				1314
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1315	total_faults = p->numa_group->total_faults;
				1316
				1317	if (!total_faults)
				1318	return 0;
				1319
				1320	faults = group_faults(p, nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1321	faults += score_nearby_nodes(p, nid, dist, false);
				1322
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1323	return 1000 * faults / total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1324	}
				1325
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1326	bool should_numa_migrate_memory(struct task_struct p, struct page page,
				1327	int src_nid, int dst_cpu)
				1328	{
				1329	struct numa_group *ng = p->numa_group;
				1330	int dst_nid = cpu_to_node(dst_cpu);
				1331	int last_cpupid, this_cpupid;
				1332
				1333	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
				1334
				1335	/*
				1336	* Multi-stage node selection is used in conjunction with a periodic
				1337	* migration fault to build a temporal task<->page relation. By using
				1338	* a two-stage filter we remove short/unlikely relations.
				1339	*
				1340	* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
				1341	* a task's usage of a particular page (n_p) per total usage of this
				1342	* page (n_t) (in a given time-span) to a probability.
				1343	*
				1344	* Our periodic faults will sample this probability and getting the
				1345	* same result twice in a row, given these samples are fully
				1346	* independent, is then given by P(n)^2, provided our sample period
				1347	* is sufficiently short compared to the usage pattern.
				1348	*
				1349	* This quadric squishes small probabilities, making it less likely we
				1350	* act on an unlikely task<->page relation.
				1351	*/
				1352	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
				1353	if (!cpupid_pid_unset(last_cpupid) &&
				1354	cpupid_to_nid(last_cpupid) != dst_nid)
				1355	return false;
				1356
				1357	/* Always allow migrate on private faults */
				1358	if (cpupid_match_pid(p, last_cpupid))
				1359	return true;
				1360
				1361	/* A shared fault, but p->numa_group has not been set up yet. */
				1362	if (!ng)
				1363	return true;
				1364
				1365	/*
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1366	* Destination node is much more heavily used than the source
				1367	* node? Allow migration.
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1368	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1369	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
				1370	ACTIVE_NODE_FRACTION)
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1371	return true;
				1372
				1373	/*
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1374	* Distribute memory according to CPU & memory use on each node,
				1375	* with 3/4 hysteresis to avoid unnecessary memory migrations:
				1376	*
				1377	* faults_cpu(dst) 3 faults_cpu(src)
				1378	* --------------- * - > ---------------
				1379	* faults_mem(dst) 4 faults_mem(src)
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1380	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1381	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
				1382	group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1383	}
				1384
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1385	static unsigned long weighted_cpuload(const int cpu);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1386	static unsigned long source_load(int cpu, int type);
				1387	static unsigned long target_load(int cpu, int type);
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	1388	static unsigned long capacity_of(int cpu);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1389	static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1390
/*
 * Per-node snapshot of scheduler statistics, accumulated over all CPUs of
 * the node by update_numa_stats().
 */
struct numa_stats {
	/* Sum of rq->nr_running over the node's CPUs */
	unsigned long nr_running;
	/* Sum of weighted_cpuload() over the node's CPUs */
	unsigned long load;

	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;

	/* Approximate capacity in terms of runnable tasks on a node */
	unsigned long task_capacity;
	/* Non-zero while nr_running is below task_capacity */
	int has_free_capacity;
};
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1403
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1404	/*
				1405	* XXX borrowed from update_sg_lb_stats
				1406	*/
				1407	static void update_numa_stats(struct numa_stats *ns, int nid)
				1408	{
Rik van Riel	83d7f24	2014-08-04 13:23:28 -0400	[diff] [blame]	1409	int smt, cpu, cpus = 0;
				1410	unsigned long capacity;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1411
				1412	memset(ns, 0, sizeof(*ns));
				1413	for_each_cpu(cpu, cpumask_of_node(nid)) {
				1414	struct rq *rq = cpu_rq(cpu);
				1415
				1416	ns->nr_running += rq->nr_running;
				1417	ns->load += weighted_cpuload(cpu);
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	1418	ns->compute_capacity += capacity_of(cpu);
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1419
				1420	cpus++;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1421	}
				1422
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1423	/*
				1424	* If we raced with hotplug and there are no CPUs left in our mask
				1425	* the @ns structure is NULL'ed and task_numa_compare() will
				1426	* not find this node attractive.
				1427	*
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1428	* We'll either bail at !has_free_capacity, or we'll detect a huge
				1429	* imbalance and bail there.
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1430	*/
				1431	if (!cpus)
				1432	return;
				1433
Rik van Riel	83d7f24	2014-08-04 13:23:28 -0400	[diff] [blame]	1434	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
				1435	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
				1436	capacity = cpus / smt; /* cores */
				1437
				1438	ns->task_capacity = min_t(unsigned, capacity,
				1439	DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1440	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1441	}
				1442
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1443	struct task_numa_env {
				1444	struct task_struct *p;
				1445
				1446	int src_cpu, src_nid;
				1447	int dst_cpu, dst_nid;
				1448
				1449	struct numa_stats src_stats, dst_stats;
				1450
Wanpeng Li	40ea2b4	2013-12-05 19:10:17 +0800	[diff] [blame]	1451	int imbalance_pct;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1452	int dist;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1453
				1454	struct task_struct *best_task;
				1455	long best_imp;
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1456	int best_cpu;
				1457	};
				1458
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1459	static void task_numa_assign(struct task_numa_env *env,
				1460	struct task_struct *p, long imp)
				1461	{
				1462	if (env->best_task)
				1463	put_task_struct(env->best_task);
Oleg Nesterov	bac7857	2016-05-18 21:57:33 +0200	[diff] [blame]	1464	if (p)
				1465	get_task_struct(p);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1466
				1467	env->best_task = p;
				1468	env->best_imp = imp;
				1469	env->best_cpu = env->dst_cpu;
				1470	}
				1471
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1472	static bool load_too_imbalanced(long src_load, long dst_load,
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1473	struct task_numa_env *env)
				1474	{
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1475	long imb, old_imb;
				1476	long orig_src_load, orig_dst_load;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1477	long src_capacity, dst_capacity;
				1478
				1479	/*
				1480	* The load is corrected for the CPU capacity available on each node.
				1481	*
				1482	* src_load dst_load
				1483	* ------------ vs ---------
				1484	* src_capacity dst_capacity
				1485	*/
				1486	src_capacity = env->src_stats.compute_capacity;
				1487	dst_capacity = env->dst_stats.compute_capacity;
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1488
				1489	/* We care about the slope of the imbalance, not the direction. */
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1490	if (dst_load < src_load)
				1491	swap(dst_load, src_load);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1492
				1493	/* Is the difference below the threshold? */
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1494	imb = dst_load * src_capacity * 100 -
				1495	src_load * dst_capacity * env->imbalance_pct;
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1496	if (imb <= 0)
				1497	return false;
				1498
				1499	/*
				1500	* The imbalance is above the allowed threshold.
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1501	* Compare it with the old imbalance.
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1502	*/
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1503	orig_src_load = env->src_stats.load;
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1504	orig_dst_load = env->dst_stats.load;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1505
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1506	if (orig_dst_load < orig_src_load)
				1507	swap(orig_dst_load, orig_src_load);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1508
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1509	old_imb = orig_dst_load * src_capacity * 100 -
				1510	orig_src_load * dst_capacity * env->imbalance_pct;
				1511
				1512	/* Would this change make things worse? */
				1513	return (imb > old_imb);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1514	}
				1515
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1516	/*
				1517	* This checks if the overall compute and NUMA accesses of the system would
				1518	* be improved if the source tasks was migrated to the target dst_cpu taking
				1519	* into account that it might be best if task running on the dst_cpu should
				1520	* be exchanged with the source task
				1521	*/
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1522	static void task_numa_compare(struct task_numa_env *env,
				1523	long taskimp, long groupimp)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1524	{
				1525	struct rq *src_rq = cpu_rq(env->src_cpu);
				1526	struct rq *dst_rq = cpu_rq(env->dst_cpu);
				1527	struct task_struct *cur;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1528	long src_load, dst_load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1529	long load;
Rik van Riel	1c5d3eb	2014-06-23 11:46:15 -0400	[diff] [blame]	1530	long imp = env->p->numa_group ? groupimp : taskimp;
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1531	long moveimp = imp;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1532	int dist = env->dist;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1533
				1534	rcu_read_lock();
Oleg Nesterov	bac7857	2016-05-18 21:57:33 +0200	[diff] [blame]	1535	cur = task_rcu_dereference(&dst_rq->curr);
				1536	if (cur && ((cur->flags & PF_EXITING) \|\| is_idle_task(cur)))
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1537	cur = NULL;
				1538
				1539	/*
Peter Zijlstra	7af6833	2014-11-10 10:54:35 +0100	[diff] [blame]	1540	* Because we have preemption enabled we can get migrated around and
				1541	* end try selecting ourselves (current == env->p) as a swap candidate.
				1542	*/
				1543	if (cur == env->p)
				1544	goto unlock;
				1545
				1546	/*
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1547	* "imp" is the fault differential for the source task between the
				1548	* source and destination node. Calculate the total differential for
				1549	* the source task and potential destination task. The more negative
				1550	* the value is, the more rmeote accesses that would be expected to
				1551	* be incurred if the tasks were swapped.
				1552	*/
				1553	if (cur) {
				1554	/* Skip this swap candidate if cannot move to the source cpu */
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	1555	if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1556	goto unlock;
				1557
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1558	/*
				1559	* If dst and source tasks are in the same NUMA group, or not
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1560	* in any group then look only at task weights.
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1561	*/
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1562	if (cur->numa_group == env->p->numa_group) {
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1563	imp = taskimp + task_weight(cur, env->src_nid, dist) -
				1564	task_weight(cur, env->dst_nid, dist);
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1565	/*
				1566	* Add some hysteresis to prevent swapping the
				1567	* tasks within a group over tiny differences.
				1568	*/
				1569	if (cur->numa_group)
				1570	imp -= imp/16;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1571	} else {
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1572	/*
				1573	* Compare the group weights. If a task is all by
				1574	* itself (not part of a group), use the task weight
				1575	* instead.
				1576	*/
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1577	if (cur->numa_group)
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1578	imp += group_weight(cur, env->src_nid, dist) -
				1579	group_weight(cur, env->dst_nid, dist);
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1580	else
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1581	imp += task_weight(cur, env->src_nid, dist) -
				1582	task_weight(cur, env->dst_nid, dist);
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1583	}
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1584	}
				1585
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1586	if (imp <= env->best_imp && moveimp <= env->best_imp)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1587	goto unlock;
				1588
				1589	if (!cur) {
				1590	/* Is there capacity at our destination? */
Rik van Riel	b932c03	2014-08-04 13:23:27 -0400	[diff] [blame]	1591	if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1592	!env->dst_stats.has_free_capacity)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1593	goto unlock;
				1594
				1595	goto balance;
				1596	}
				1597
				1598	/* Balance doesn't matter much if we're running a task per cpu */
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1599	if (imp > env->best_imp && src_rq->nr_running == 1 &&
				1600	dst_rq->nr_running == 1)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1601	goto assign;
				1602
				1603	/*
				1604	* In the overloaded case, try and keep the load balanced.
				1605	*/
				1606	balance:
Peter Zijlstra	e720fff	2014-07-11 16:01:53 +0200	[diff] [blame]	1607	load = task_h_load(env->p);
				1608	dst_load = env->dst_stats.load + load;
				1609	src_load = env->src_stats.load - load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1610
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1611	if (moveimp > imp && moveimp > env->best_imp) {
				1612	/*
				1613	* If the improvement from just moving env->p direction is
				1614	* better than swapping tasks around, check if a move is
				1615	* possible. Store a slightly smaller score than moveimp,
				1616	* so an actually idle CPU will win.
				1617	*/
				1618	if (!load_too_imbalanced(src_load, dst_load, env)) {
				1619	imp = moveimp - 1;
				1620	cur = NULL;
				1621	goto assign;
				1622	}
				1623	}
				1624
				1625	if (imp <= env->best_imp)
				1626	goto unlock;
				1627
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1628	if (cur) {
Peter Zijlstra	e720fff	2014-07-11 16:01:53 +0200	[diff] [blame]	1629	load = task_h_load(cur);
				1630	dst_load -= load;
				1631	src_load += load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1632	}
				1633
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1634	if (load_too_imbalanced(src_load, dst_load, env))
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1635	goto unlock;
				1636
Rik van Riel	ba7e5a2	2014-09-04 16:35:30 -0400	[diff] [blame]	1637	/*
				1638	* One idle CPU per node is evaluated for a task numa move.
				1639	* Call select_idle_sibling to maybe find a better one.
				1640	*/
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	1641	if (!cur) {
				1642	/*
				1643	* select_idle_siblings() uses an per-cpu cpumask that
				1644	* can be used from IRQ context.
				1645	*/
				1646	local_irq_disable();
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	1647	env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
				1648	env->dst_cpu);
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	1649	local_irq_enable();
				1650	}
Rik van Riel	ba7e5a2	2014-09-04 16:35:30 -0400	[diff] [blame]	1651
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1652	assign:
				1653	task_numa_assign(env, cur, imp);
				1654	unlock:
				1655	rcu_read_unlock();
				1656	}
				1657
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1658	static void task_numa_find_cpu(struct task_numa_env *env,
				1659	long taskimp, long groupimp)
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1660	{
				1661	int cpu;
				1662
				1663	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
				1664	/* Skip this CPU if the source task cannot migrate */
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	1665	if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1666	continue;
				1667
				1668	env->dst_cpu = cpu;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1669	task_numa_compare(env, taskimp, groupimp);
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1670	}
				1671	}
				1672
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1673	/* Only move tasks to a NUMA node less busy than the current node. */
				1674	static bool numa_has_capacity(struct task_numa_env *env)
				1675	{
				1676	struct numa_stats *src = &env->src_stats;
				1677	struct numa_stats *dst = &env->dst_stats;
				1678
				1679	if (src->has_free_capacity && !dst->has_free_capacity)
				1680	return false;
				1681
				1682	/*
				1683	* Only consider a task move if the source has a higher load
				1684	* than the destination, corrected for CPU capacity on each node.
				1685	*
				1686	* src->load dst->load
				1687	* --------------------- vs ---------------------
				1688	* src->compute_capacity dst->compute_capacity
				1689	*/
Srikar Dronamraju	44dcb04	2015-06-16 17:26:00 +0530	[diff] [blame]	1690	if (src->load * dst->compute_capacity * env->imbalance_pct >
				1691
				1692	dst->load * src->compute_capacity * 100)
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1693	return true;
				1694
				1695	return false;
				1696	}
				1697
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1698	static int task_numa_migrate(struct task_struct *p)
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1699	{
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1700	struct task_numa_env env = {
				1701	.p = p,
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1702
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1703	.src_cpu = task_cpu(p),
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	1704	.src_nid = task_node(p),
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1705
				1706	.imbalance_pct = 112,
				1707
				1708	.best_task = NULL,
				1709	.best_imp = 0,
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1710	.best_cpu = -1,
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1711	};
				1712	struct sched_domain *sd;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1713	unsigned long taskweight, groupweight;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1714	int nid, ret, dist;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1715	long taskimp, groupimp;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1716
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1717	/*
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1718	* Pick the lowest SD_NUMA domain, as that would have the smallest
				1719	* imbalance and would be the first to start moving tasks about.
				1720	*
				1721	* And we want to avoid any moving of tasks about, as that would create
				1722	* random movement of tasks -- counter the numa conditions we're trying
				1723	* to satisfy here.
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1724	*/
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1725	rcu_read_lock();
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1726	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1727	if (sd)
				1728	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1729	rcu_read_unlock();
				1730
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1731	/*
				1732	* Cpusets can break the scheduler domain tree into smaller
				1733	* balance domains, some of which do not cross NUMA boundaries.
				1734	* Tasks that are "trapped" in such domains cannot be migrated
				1735	* elsewhere, so there is no point in (re)trying.
				1736	*/
				1737	if (unlikely(!sd)) {
Wanpeng Li	de1b301	2013-12-12 15:23:24 +0800	[diff] [blame]	1738	p->numa_preferred_nid = task_node(p);
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1739	return -EINVAL;
				1740	}
				1741
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1742	env.dst_nid = p->numa_preferred_nid;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1743	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
				1744	taskweight = task_weight(p, env.src_nid, dist);
				1745	groupweight = group_weight(p, env.src_nid, dist);
				1746	update_numa_stats(&env.src_stats, env.src_nid);
				1747	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
				1748	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1749	update_numa_stats(&env.dst_stats, env.dst_nid);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1750
Rik van Riel	a43455a	2014-06-04 16:09:42 -0400	[diff] [blame]	1751	/* Try to find a spot on the preferred nid. */
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1752	if (numa_has_capacity(&env))
				1753	task_numa_find_cpu(&env, taskimp, groupimp);
Rik van Riel	e1dda8a	2013-10-07 11:29:19 +0100	[diff] [blame]	1754
Rik van Riel	9de05d4	2014-10-09 17:27:47 -0400	[diff] [blame]	1755	/*
				1756	* Look at other nodes in these cases:
				1757	* - there is no space available on the preferred_nid
				1758	* - the task is part of a numa_group that is interleaved across
				1759	* multiple NUMA nodes; in order to better consolidate the group,
				1760	* we need to check other locations.
				1761	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1762	if (env.best_cpu == -1 \|\| (p->numa_group && p->numa_group->active_nodes > 1)) {
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1763	for_each_online_node(nid) {
				1764	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1765	continue;
				1766
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1767	dist = node_distance(env.src_nid, env.dst_nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1768	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1769	dist != env.dist) {
				1770	taskweight = task_weight(p, env.src_nid, dist);
				1771	groupweight = group_weight(p, env.src_nid, dist);
				1772	}
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1773
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1774	/* Only consider nodes where both task and groups benefit */
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1775	taskimp = task_weight(p, nid, dist) - taskweight;
				1776	groupimp = group_weight(p, nid, dist) - groupweight;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1777	if (taskimp < 0 && groupimp < 0)
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1778	continue;
				1779
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1780	env.dist = dist;
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1781	env.dst_nid = nid;
				1782	update_numa_stats(&env.dst_stats, env.dst_nid);
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1783	if (numa_has_capacity(&env))
				1784	task_numa_find_cpu(&env, taskimp, groupimp);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1785	}
				1786	}
				1787
Rik van Riel	68d1b02	2014-04-11 13:00:29 -0400	[diff] [blame]	1788	/*
				1789	* If the task is part of a workload that spans multiple NUMA nodes,
				1790	* and is migrating into one of the workload's active nodes, remember
				1791	* this node as the task's preferred numa node, so the workload can
				1792	* settle down.
				1793	* A task that migrated to a second choice node will be better off
				1794	* trying for a better one later. Do not set the preferred node here.
				1795	*/
Rik van Riel	db015da	2014-06-23 11:41:34 -0400	[diff] [blame]	1796	if (p->numa_group) {
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1797	struct numa_group *ng = p->numa_group;
				1798
Rik van Riel	db015da	2014-06-23 11:41:34 -0400	[diff] [blame]	1799	if (env.best_cpu == -1)
				1800	nid = env.src_nid;
				1801	else
				1802	nid = env.dst_nid;
				1803
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1804	if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
Rik van Riel	db015da	2014-06-23 11:41:34 -0400	[diff] [blame]	1805	sched_setnuma(p, env.dst_nid);
				1806	}
				1807
				1808	/* No better CPU than the current one was found. */
				1809	if (env.best_cpu == -1)
				1810	return -EAGAIN;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	1811
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1812	/*
				1813	* Reset the scan period if the task is being rescheduled on an
				1814	* alternative node to recheck if the tasks is now properly placed.
				1815	*/
				1816	p->numa_scan_period = task_scan_min(p);
				1817
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1818	if (env.best_task == NULL) {
Mel Gorman	286549d	2014-01-21 15:51:03 -0800	[diff] [blame]	1819	ret = migrate_task_to(p, env.best_cpu);
				1820	if (ret != 0)
				1821	trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1822	return ret;
				1823	}
				1824
				1825	ret = migrate_swap(p, env.best_task);
Mel Gorman	286549d	2014-01-21 15:51:03 -0800	[diff] [blame]	1826	if (ret != 0)
				1827	trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1828	put_task_struct(env.best_task);
				1829	return ret;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1830	}
				1831
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1832	/* Attempt to migrate a task to a CPU on the preferred node. */
				1833	static void numa_migrate_preferred(struct task_struct *p)
				1834	{
Rik van Riel	5085e2a	2014-04-11 13:00:28 -0400	[diff] [blame]	1835	unsigned long interval = HZ;
				1836
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1837	/* This task has no NUMA fault statistics yet */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1838	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1839	return;
				1840
				1841	/* Periodically retry migrating the task to the preferred node */
Rik van Riel	5085e2a	2014-04-11 13:00:28 -0400	[diff] [blame]	1842	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
				1843	p->numa_migrate_retry = jiffies + interval;
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1844
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1845	/* Success if task is already running on preferred CPU */
Wanpeng Li	de1b301	2013-12-12 15:23:24 +0800	[diff] [blame]	1846	if (task_node(p) == p->numa_preferred_nid)
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1847	return;
				1848
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1849	/* Otherwise, try migrate to a CPU on the preferred node */
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1850	task_numa_migrate(p);
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1851	}
				1852
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1853	/*
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1854	* Find out how many nodes the workload is actively running on. Do this by
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1855	* tracking the nodes from which NUMA hinting faults are triggered. This can
				1856	* be different from the set of nodes where the workload's memory is currently
				1857	* located.
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1858	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1859	static void numa_group_count_active_nodes(struct numa_group *numa_group)
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1860	{
				1861	unsigned long faults, max_faults = 0;
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1862	int nid, active_nodes = 0;
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1863
				1864	for_each_online_node(nid) {
				1865	faults = group_faults_cpu(numa_group, nid);
				1866	if (faults > max_faults)
				1867	max_faults = faults;
				1868	}
				1869
				1870	for_each_online_node(nid) {
				1871	faults = group_faults_cpu(numa_group, nid);
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1872	if (faults * ACTIVE_NODE_FRACTION > max_faults)
				1873	active_nodes++;
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1874	}
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1875
				1876	numa_group->max_faults_cpu = max_faults;
				1877	numa_group->active_nodes = active_nodes;
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1878	}
				1879
				1880	/*
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1881	* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
				1882	* increments. The more local the fault statistics are, the higher the scan
Rik van Riel	a22b4b0	2014-06-23 11:41:35 -0400	[diff] [blame]	1883	* period will be for the next scan window. If local/(local+remote) ratio is
				1884	* below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
				1885	* the scan period will decrease. Aim for 70% local accesses.
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1886	*/
				1887	#define NUMA_PERIOD_SLOTS 10
Rik van Riel	a22b4b0	2014-06-23 11:41:35 -0400	[diff] [blame]	1888	#define NUMA_PERIOD_THRESHOLD 7
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1889
				1890	/*
				1891	* Increase the scan period (slow down scanning) if the majority of
				1892	* our memory is already on our local node, or if the majority of
				1893	* the page accesses are shared with other processes.
				1894	* Otherwise, decrease the scan period.
				1895	*/
				1896	static void update_task_scan_period(struct task_struct *p,
				1897	unsigned long shared, unsigned long private)
				1898	{
				1899	unsigned int period_slot;
				1900	int ratio;
				1901	int diff;
				1902
				1903	unsigned long remote = p->numa_faults_locality[0];
				1904	unsigned long local = p->numa_faults_locality[1];
				1905
				1906	/*
				1907	* If there were no recorded hinting faults then either the task is
				1908	* completely idle or all activity is in areas that are not of interest
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	1909	* to automatic numa balancing. Related to that, if there were failed
				1910	* migration then it implies we are migrating too quickly or the local
				1911	* node is overloaded. In either case, scan slower
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1912	*/
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	1913	if (local + shared == 0 || p->numa_faults_locality[2]) {
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1914	p->numa_scan_period = min(p->numa_scan_period_max,
				1915	p->numa_scan_period << 1);
				1916
				1917	p->mm->numa_next_scan = jiffies +
				1918	msecs_to_jiffies(p->numa_scan_period);
				1919
				1920	return;
				1921	}
				1922
				1923	/*
				1924	* Prepare to scale scan period relative to the current period.
				1925	* == NUMA_PERIOD_THRESHOLD scan period increases by one slot (the minimum step)
				1926	* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
				1927	* > NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
				1928	*/
				1929	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
				1930	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
				1931	if (ratio >= NUMA_PERIOD_THRESHOLD) {
				1932	int slot = ratio - NUMA_PERIOD_THRESHOLD;
				1933	if (!slot)
				1934	slot = 1;
				1935	diff = slot * period_slot;
				1936	} else {
				1937	diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
				1938
				1939	/*
				1940	* Scale scan rate increases based on sharing. There is an
				1941	* inverse relationship between the degree of sharing and
				1942	* the adjustment made to the scanning period. Broadly
				1943	* speaking the intent is that there is little point
				1944	* scanning faster if shared accesses dominate as it may
				1945	* simply bounce migrations uselessly
				1946	*/
Yasuaki Ishimatsu	2847c90	2014-10-22 16:04:35 +0900	[diff] [blame]	1947	ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1948	diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
				1949	}
				1950
				1951	p->numa_scan_period = clamp(p->numa_scan_period + diff,
				1952	task_scan_min(p), task_scan_max(p));
				1953	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
				1954	}
				1955
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1956	/*
				1957	* Get the fraction of time the task has been running since the last
				1958	* NUMA placement cycle. The scheduler keeps similar statistics, but
				1959	* decays those on a 32ms period, which is orders of magnitude off
				1960	* from the dozens-of-seconds NUMA balancing period. Use the scheduler
				1961	* stats only if the task is so new there are no NUMA statistics yet.
				1962	*/
				1963	static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
				1964	{
				1965	u64 runtime, delta, now;
				1966	/* Use the start of this time slice to avoid calculations. */
				1967	now = p->se.exec_start;
				1968	runtime = p->se.sum_exec_runtime;
				1969
				1970	if (p->last_task_numa_placement) {
				1971	delta = runtime - p->last_sum_exec_runtime;
				1972	*period = now - p->last_task_numa_placement;
				1973	} else {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	1974	delta = p->se.avg.load_sum / p->se.load.weight;
				1975	*period = LOAD_AVG_MAX;
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1976	}
				1977
				1978	p->last_sum_exec_runtime = runtime;
				1979	p->last_task_numa_placement = now;
				1980
				1981	return delta;
				1982	}
				1983
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	1984	/*
				1985	* Determine the preferred nid for a task in a numa_group. This needs to
				1986	* be done in a way that produces consistent results with group_weight,
				1987	* otherwise workloads might not converge.
				1988	*/
				1989	static int preferred_group_nid(struct task_struct *p, int nid)
				1990	{
				1991	nodemask_t nodes;
				1992	int dist;
				1993
				1994	/* Direct connections between all NUMA nodes. */
				1995	if (sched_numa_topology_type == NUMA_DIRECT)
				1996	return nid;
				1997
				1998	/*
				1999	* On a system with glueless mesh NUMA topology, group_weight
				2000	* scores nodes according to the number of NUMA hinting faults on
				2001	* both the node itself, and on nearby nodes.
				2002	*/
				2003	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				2004	unsigned long score, max_score = 0;
				2005	int node, max_node = nid;
				2006
				2007	dist = sched_max_numa_distance;
				2008
				2009	for_each_online_node(node) {
				2010	score = group_weight(p, node, dist);
				2011	if (score > max_score) {
				2012	max_score = score;
				2013	max_node = node;
				2014	}
				2015	}
				2016	return max_node;
				2017	}
				2018
				2019	/*
				2020	* Finding the preferred nid in a system with NUMA backplane
				2021	* interconnect topology is more involved. The goal is to locate
				2022	* tasks from numa_groups near each other in the system, and
				2023	* untangle workloads from different sides of the system. This requires
				2024	* searching down the hierarchy of node groups, recursively searching
				2025	* inside the highest scoring group of nodes. The nodemask tricks
				2026	* keep the complexity of the search down.
				2027	*/
				2028	nodes = node_online_map;
				2029	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
				2030	unsigned long max_faults = 0;
Jan Beulich	8190747	2015-01-23 08:25:38 +0000	[diff] [blame]	2031	nodemask_t max_group = NODE_MASK_NONE;
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	2032	int a, b;
				2033
				2034	/* Are there nodes at this distance from each other? */
				2035	if (!find_numa_distance(dist))
				2036	continue;
				2037
				2038	for_each_node_mask(a, nodes) {
				2039	unsigned long faults = 0;
				2040	nodemask_t this_group;
				2041	nodes_clear(this_group);
				2042
				2043	/* Sum group's NUMA faults; includes a==b case. */
				2044	for_each_node_mask(b, nodes) {
				2045	if (node_distance(a, b) < dist) {
				2046	faults += group_faults(p, b);
				2047	node_set(b, this_group);
				2048	node_clear(b, nodes);
				2049	}
				2050	}
				2051
				2052	/* Remember the top group. */
				2053	if (faults > max_faults) {
				2054	max_faults = faults;
				2055	max_group = this_group;
				2056	/*
				2057	* subtle: at the smallest distance there is
				2058	* just one node left in each "group", the
				2059	* winner is the preferred nid.
				2060	*/
				2061	nid = a;
				2062	}
				2063	}
				2064	/* Next round, evaluate the nodes within max_group. */
Jan Beulich	890a540	2015-02-09 12:30:00 +0100	[diff] [blame]	2065	if (!max_faults)
				2066	break;
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	2067	nodes = max_group;
				2068	}
				2069	return nid;
				2070	}
				2071
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2072	static void task_numa_placement(struct task_struct *p)
				2073	{
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2074	int seq, nid, max_nid = -1, max_group_nid = -1;
				2075	unsigned long max_faults = 0, max_group_faults = 0;
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	2076	unsigned long fault_types[2] = { 0, 0 };
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2077	unsigned long total_faults;
				2078	u64 runtime, period;
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	2079	spinlock_t *group_lock = NULL;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2080
Jason Low	7e5a2c1	2015-04-30 17:28:14 -0700	[diff] [blame]	2081	/*
				2082	* The p->mm->numa_scan_seq field gets updated without
				2083	* exclusive access. Use READ_ONCE() here to ensure
				2084	* that the field is read in a single access:
				2085	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	2086	seq = READ_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2087	if (p->numa_scan_seq == seq)
				2088	return;
				2089	p->numa_scan_seq = seq;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2090	p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2091
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2092	total_faults = p->numa_faults_locality[0] +
				2093	p->numa_faults_locality[1];
				2094	runtime = numa_get_avg_runtime(p, &period);
				2095
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	2096	/* If the task is part of a group prevent parallel updates to group stats */
				2097	if (p->numa_group) {
				2098	group_lock = &p->numa_group->lock;
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2099	spin_lock_irq(group_lock);
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	2100	}
				2101
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	2102	/* Find the node with the highest number of faults */
				2103	for_each_online_node(nid) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2104	/* Keep track of the offsets in numa_faults array */
				2105	int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2106	unsigned long faults = 0, group_faults = 0;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2107	int priv;
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	2108
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2109	for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2110	long diff, f_diff, f_weight;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2111
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2112	mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
				2113	membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
				2114	cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
				2115	cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	2116
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	2117	/* Decay existing window, copy faults since last scan */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2118	diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
				2119	fault_types[priv] += p->numa_faults[membuf_idx];
				2120	p->numa_faults[membuf_idx] = 0;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	2121
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2122	/*
				2123	* Normalize the faults_from, so all tasks in a group
				2124	* count according to CPU use, instead of by the raw
				2125	* number of faults. Tasks with little runtime have
				2126	* little over-all impact on throughput, and thus their
				2127	* faults are less important.
				2128	*/
				2129	f_weight = div64_u64(runtime << 16, period + 1);
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2130	f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2131	(total_faults + 1);
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2132	f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
				2133	p->numa_faults[cpubuf_idx] = 0;
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	2134
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2135	p->numa_faults[mem_idx] += diff;
				2136	p->numa_faults[cpu_idx] += f_diff;
				2137	faults += p->numa_faults[mem_idx];
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2138	p->total_numa_faults += diff;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2139	if (p->numa_group) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2140	/*
				2141	* safe because we can only change our own group
				2142	*
				2143	* mem_idx represents the offset for a given
				2144	* nid and priv in a specific region because it
				2145	* is at the beginning of the numa_faults array.
				2146	*/
				2147	p->numa_group->faults[mem_idx] += diff;
				2148	p->numa_group->faults_cpu[mem_idx] += f_diff;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2149	p->numa_group->total_faults += diff;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2150	group_faults += p->numa_group->faults[mem_idx];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2151	}
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	2152	}
				2153
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	2154	if (faults > max_faults) {
				2155	max_faults = faults;
				2156	max_nid = nid;
				2157	}
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2158
				2159	if (group_faults > max_group_faults) {
				2160	max_group_faults = group_faults;
				2161	max_group_nid = nid;
				2162	}
				2163	}
				2164
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	2165	update_task_scan_period(p, fault_types[0], fault_types[1]);
				2166
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	2167	if (p->numa_group) {
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	2168	numa_group_count_active_nodes(p->numa_group);
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2169	spin_unlock_irq(group_lock);
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	2170	max_nid = preferred_group_nid(p, max_group_nid);
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	2171	}
				2172
Rik van Riel	bb97fc3	2014-06-04 16:33:15 -0400	[diff] [blame]	2173	if (max_faults) {
				2174	/* Set the new preferred node */
				2175	if (max_nid != p->numa_preferred_nid)
				2176	sched_setnuma(p, max_nid);
				2177
				2178	if (task_node(p) != p->numa_preferred_nid)
				2179	numa_migrate_preferred(p);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	2180	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2181	}
				2182
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2183	static inline int get_numa_group(struct numa_group *grp)
				2184	{
				2185	return atomic_inc_not_zero(&grp->refcount);
				2186	}
				2187
				2188	static inline void put_numa_group(struct numa_group *grp)
				2189	{
				2190	if (atomic_dec_and_test(&grp->refcount))
				2191	kfree_rcu(grp, rcu);
				2192	}
				2193
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	2194	static void task_numa_group(struct task_struct *p, int cpupid, int flags,
				2195	int *priv)
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2196	{
				2197	struct numa_group *grp, *my_grp;
				2198	struct task_struct *tsk;
				2199	bool join = false;
				2200	int cpu = cpupid_to_cpu(cpupid);
				2201	int i;
				2202
				2203	if (unlikely(!p->numa_group)) {
				2204	unsigned int size = sizeof(struct numa_group) +
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	2205	4*nr_node_ids*sizeof(unsigned long);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2206
				2207	grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
				2208	if (!grp)
				2209	return;
				2210
				2211	atomic_set(&grp->refcount, 1);
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	2212	grp->active_nodes = 1;
				2213	grp->max_faults_cpu = 0;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2214	spin_lock_init(&grp->lock);
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	2215	grp->gid = p->pid;
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	2216	/* Second half of the array tracks nids where faults happen */
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2217	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
				2218	nr_node_ids;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2219
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2220	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2221	grp->faults[i] = p->numa_faults[i];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2222
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2223	grp->total_faults = p->total_numa_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2224
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2225	grp->nr_tasks++;
				2226	rcu_assign_pointer(p->numa_group, grp);
				2227	}
				2228
				2229	rcu_read_lock();
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	2230	tsk = READ_ONCE(cpu_rq(cpu)->curr);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2231
				2232	if (!cpupid_match_pid(tsk, cpupid))
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2233	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2234
				2235	grp = rcu_dereference(tsk->numa_group);
				2236	if (!grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2237	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2238
				2239	my_grp = p->numa_group;
				2240	if (grp == my_grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2241	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2242
				2243	/*
				2244	* Only join the other group if it's bigger; if we're the bigger group,
				2245	* the other task will join us.
				2246	*/
				2247	if (my_grp->nr_tasks > grp->nr_tasks)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2248	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2249
				2250	/*
				2251	* Tie-break on the grp address.
				2252	*/
				2253	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2254	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2255
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2256	/* Always join threads in the same process. */
				2257	if (tsk->mm == current->mm)
				2258	join = true;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2259
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2260	/* Simple filter to avoid false positives due to PID collisions */
				2261	if (flags & TNF_SHARED)
				2262	join = true;
				2263
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	2264	/* Update priv based on whether false sharing was detected */
				2265	*priv = !join;
				2266
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2267	if (join && !get_numa_group(grp))
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2268	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2269
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2270	rcu_read_unlock();
				2271
				2272	if (!join)
				2273	return;
				2274
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2275	BUG_ON(irqs_disabled());
				2276	double_lock_irq(&my_grp->lock, &grp->lock);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2277
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2278	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2279	my_grp->faults[i] -= p->numa_faults[i];
				2280	grp->faults[i] += p->numa_faults[i];
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2281	}
				2282	my_grp->total_faults -= p->total_numa_faults;
				2283	grp->total_faults += p->total_numa_faults;
				2284
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2285	my_grp->nr_tasks--;
				2286	grp->nr_tasks++;
				2287
				2288	spin_unlock(&my_grp->lock);
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2289	spin_unlock_irq(&grp->lock);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2290
				2291	rcu_assign_pointer(p->numa_group, grp);
				2292
				2293	put_numa_group(my_grp);
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2294	return;
				2295
				2296	no_join:
				2297	rcu_read_unlock();
				2298	return;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2299	}
				2300
				2301	void task_numa_free(struct task_struct *p)
				2302	{
				2303	struct numa_group *grp = p->numa_group;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2304	void *numa_faults = p->numa_faults;
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2305	unsigned long flags;
				2306	int i;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2307
				2308	if (grp) {
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2309	spin_lock_irqsave(&grp->lock, flags);
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2310	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2311	grp->faults[i] -= p->numa_faults[i];
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2312	grp->total_faults -= p->total_numa_faults;
				2313
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2314	grp->nr_tasks--;
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2315	spin_unlock_irqrestore(&grp->lock, flags);
Andreea-Cristina Bernat	35b123e	2014-08-22 17:50:43 +0300	[diff] [blame]	2316	RCU_INIT_POINTER(p->numa_group, NULL);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2317	put_numa_group(grp);
				2318	}
				2319
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2320	p->numa_faults = NULL;
Rik van Riel	8272701	2013-10-07 11:29:28 +0100	[diff] [blame]	2321	kfree(numa_faults);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2322	}
				2323
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2324	/*
				2325	* Got a PROT_NONE fault for a page on @node.
				2326	*/
Rik van Riel	58b46da	2014-01-27 17:03:47 -0500	[diff] [blame]	2327	void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2328	{
				2329	struct task_struct *p = current;
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame]	2330	bool migrated = flags & TNF_MIGRATED;
Rik van Riel	58b46da	2014-01-27 17:03:47 -0500	[diff] [blame]	2331	int cpu_node = task_node(current);
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2332	int local = !!(flags & TNF_FAULT_LOCAL);
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	2333	struct numa_group *ng;
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	2334	int priv;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2335
Srikar Dronamraju	2a59572	2015-08-11 21:54:21 +0530	[diff] [blame]	2336	if (!static_branch_likely(&sched_numa_balancing))
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	2337	return;
				2338
Mel Gorman	9ff1d9f	2013-10-07 11:29:04 +0100	[diff] [blame]	2339	/* for example, ksmd faulting in a user's mm */
				2340	if (!p->mm)
				2341	return;
				2342
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2343	/* Allocate buffer to track faults on a per-node basis */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2344	if (unlikely(!p->numa_faults)) {
				2345	int size = sizeof(*p->numa_faults) *
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2346	NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2347
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2348	p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
				2349	if (!p->numa_faults)
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2350	return;
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	2351
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2352	p->total_numa_faults = 0;
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	2353	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2354	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2355
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	2356	/*
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2357	* First accesses are treated as private, otherwise consider accesses
				2358	* to be private if the accessing pid has not changed
				2359	*/
				2360	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
				2361	priv = 1;
				2362	} else {
				2363	priv = cpupid_match_pid(p, last_cpupid);
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame]	2364	if (!priv && !(flags & TNF_NO_GROUP))
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	2365	task_numa_group(p, last_cpupid, flags, &priv);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2366	}
				2367
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2368	/*
				2369	* If a workload spans multiple NUMA nodes, a shared fault that
				2370	* occurs wholly within the set of nodes that the workload is
				2371	* actively using should be counted as local. This allows the
				2372	* scan rate to slow down when a workload has settled down.
				2373	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	2374	ng = p->numa_group;
				2375	if (!priv && !local && ng && ng->active_nodes > 1 &&
				2376	numa_is_active_node(cpu_node, ng) &&
				2377	numa_is_active_node(mem_node, ng))
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2378	local = 1;
				2379
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2380	task_numa_placement(p);
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2381
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	2382	/*
				2383	* Retry task to preferred node migration periodically, in case it
				2384	* previously failed, or the scheduler moved us.
				2385	*/
				2386	if (time_after(jiffies, p->numa_migrate_retry))
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	2387	numa_migrate_preferred(p);
				2388
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	2389	if (migrated)
				2390	p->numa_pages_migrated += pages;
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	2391	if (flags & TNF_MIGRATE_FAIL)
				2392	p->numa_faults_locality[2] += pages;
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	2393
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2394	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
				2395	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2396	p->numa_faults_locality[local] += pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2397	}
				2398
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2399	static void reset_ptenuma_scan(struct task_struct *p)
				2400	{
Jason Low	7e5a2c1	2015-04-30 17:28:14 -0700	[diff] [blame]	2401	/*
				2402	* We only did a read acquisition of the mmap sem, so
				2403	* p->mm->numa_scan_seq is written to without exclusive access
				2404	* and the update is not guaranteed to be atomic. That's not
				2405	* much of an issue though, since this is just used for
				2406	* statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
				2407	* expensive, to avoid any form of compiler optimizations:
				2408	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	2409	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2410	p->mm->numa_scan_offset = 0;
				2411	}
				2412
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2413	/*
				2414	* The expensive part of numa migration is done from task_work context.
				2415	* Triggered from task_tick_numa().
				2416	*/
				2417	void task_numa_work(struct callback_head *work)
				2418	{
				2419	unsigned long migrate, next_scan, now = jiffies;
				2420	struct task_struct *p = current;
				2421	struct mm_struct *mm = p->mm;
Rik van Riel	5117084	2015-11-05 15:56:23 -0500	[diff] [blame]	2422	u64 runtime = p->se.sum_exec_runtime;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2423	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2424	unsigned long start, end;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2425	unsigned long nr_pte_updates = 0;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2426	long pages, virtpages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2427
Peter Zijlstra	9148a3a	2016-09-20 22:34:51 +0200	[diff] [blame]	2428	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2429
				2430	work->next = work; /* protect against double add */
				2431	/*
				2432	* Who cares about NUMA placement when they're dying.
				2433	*
				2434	* NOTE: make sure not to dereference p->mm before this check,
				2435	* exit_task_work() happens _after_ exit_mm() so we could be called
				2436	* without p->mm even though we still had it when we enqueued this
				2437	* work.
				2438	*/
				2439	if (p->flags & PF_EXITING)
				2440	return;
				2441
Mel Gorman	930aa17	2013-10-07 11:29:37 +0100	[diff] [blame]	2442	if (!mm->numa_next_scan) {
Mel Gorman	7e8d16b	2013-10-07 11:28:54 +0100	[diff] [blame]	2443	mm->numa_next_scan = now +
				2444	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	2445	}
				2446
				2447	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2448	* Enforce maximal scan/migration frequency..
				2449	*/
				2450	migrate = mm->numa_next_scan;
				2451	if (time_before(now, migrate))
				2452	return;
				2453
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2454	if (p->numa_scan_period == 0) {
				2455	p->numa_scan_period_max = task_scan_max(p);
				2456	p->numa_scan_period = task_scan_min(p);
				2457	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2458
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	2459	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2460	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				2461	return;
				2462
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	2463	/*
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	2464	* Delay this task enough that another task of this mm will likely win
				2465	* the next time around.
				2466	*/
				2467	p->node_stamp += 2 * TICK_NSEC;
				2468
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2469	start = mm->numa_scan_offset;
				2470	pages = sysctl_numa_balancing_scan_size;
				2471	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2472	virtpages = pages * 8; /* Scan up to this much virtual space */
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2473	if (!pages)
				2474	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2475
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2476
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2477	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2478	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2479	if (!vma) {
				2480	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2481	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2482	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2483	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2484	for (; vma; vma = vma->vm_next) {
Naoya Horiguchi	6b79c57	2015-04-07 14:26:47 -0700	[diff] [blame]	2485	if (!vma_migratable(vma) \|\| !vma_policy_mof(vma) \|\|
Mel Gorman	8e76d4e	2015-06-10 11:15:00 -0700	[diff] [blame]	2486	is_vm_hugetlb_page(vma) \|\| (vma->vm_flags & VM_MIXEDMAP)) {
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2487	continue;
Naoya Horiguchi	6b79c57	2015-04-07 14:26:47 -0700	[diff] [blame]	2488	}
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2489
Mel Gorman	4591ce4f	2013-10-07 11:29:13 +0100	[diff] [blame]	2490	/*
				2491	* Shared library pages mapped by multiple processes are not
				2492	* migrated as it is expected they are cache replicated. Avoid
				2493	* hinting faults in read-only file-backed mappings or the vdso
				2494	* as migrating the pages will be of marginal benefit.
				2495	*/
				2496	if (!vma->vm_mm \|\|
				2497	(vma->vm_file && (vma->vm_flags & (VM_READ\|VM_WRITE)) == (VM_READ)))
				2498	continue;
				2499
Mel Gorman	3c67f47	2013-12-18 17:08:40 -0800	[diff] [blame]	2500	/*
				2501	* Skip inaccessible VMAs to avoid any confusion between
				2502	* PROT_NONE and NUMA hinting ptes
				2503	*/
				2504	if (!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE)))
				2505	continue;
				2506
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2507	do {
				2508	start = max(start, vma->vm_start);
				2509	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				2510	end = min(end, vma->vm_end);
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2511	nr_pte_updates = change_prot_numa(vma, start, end);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2512
				2513	/*
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2514	* Try to scan sysctl_numa_balancing_size worth of
				2515	* hpages that have at least one present PTE that
				2516	* is not already pte-numa. If the VMA contains
				2517	* areas that are unused or already full of prot_numa
				2518	* PTEs, scan up to virtpages, to skip through those
				2519	* areas faster.
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2520	*/
				2521	if (nr_pte_updates)
				2522	pages -= (end - start) >> PAGE_SHIFT;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2523	virtpages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2524
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2525	start = end;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2526	if (pages <= 0 \|\| virtpages <= 0)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2527	goto out;
Rik van Riel	3cf1962	2014-02-18 17:12:44 -0500	[diff] [blame]	2528
				2529	cond_resched();
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2530	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2531	}
				2532
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2533	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2534	/*
Peter Zijlstra	c69307d	2013-10-07 11:28:41 +0100	[diff] [blame]	2535	* It is possible to reach the end of the VMA list but the last few
				2536	* VMAs are not guaranteed to the vma_migratable. If they are not, we
				2537	* would find the !migratable VMA on the next scan but not reset the
				2538	* scanner to the start so check it now.
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2539	*/
				2540	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2541	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2542	else
				2543	reset_ptenuma_scan(p);
				2544	up_read(&mm->mmap_sem);
Rik van Riel	5117084	2015-11-05 15:56:23 -0500	[diff] [blame]	2545
				2546	/*
				2547	* Make sure tasks use at least 32x as much time to run other code
				2548	* than they used here, to limit NUMA PTE scanning overhead to 3% max.
				2549	* Usually update_task_scan_period slows down scanning enough; on an
				2550	* overloaded system we need to limit overhead on a per task basis.
				2551	*/
				2552	if (unlikely(p->se.sum_exec_runtime != runtime)) {
				2553	u64 diff = p->se.sum_exec_runtime - runtime;
				2554	p->node_stamp += 32 * diff;
				2555	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2556	}
				2557
				2558	/*
				2559	* Drive the periodic memory faults..
				2560	*/
				2561	void task_tick_numa(struct rq rq, struct task_struct curr)
				2562	{
				2563	struct callback_head *work = &curr->numa_work;
				2564	u64 period, now;
				2565
				2566	/*
				2567	* We don't care about NUMA placement if we don't have memory.
				2568	*/
				2569	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				2570	return;
				2571
				2572	/*
				2573	* Using runtime rather than walltime has the dual advantage that
				2574	* we (mostly) drive the selection from busy threads and that the
				2575	* task needs to have done some actual work before we bother with
				2576	* NUMA placement.
				2577	*/
				2578	now = curr->se.sum_exec_runtime;
				2579	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				2580
Rik van Riel	25b3e5a	2015-11-05 15:56:22 -0500	[diff] [blame]	2581	if (now > curr->node_stamp + period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	2582	if (!curr->node_stamp)
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2583	curr->numa_scan_period = task_scan_min(curr);
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	2584	curr->node_stamp += period;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2585
				2586	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				2587	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				2588	task_work_add(curr, work, true);
				2589	}
				2590	}
				2591	}
				2592	#else
				2593	static void task_tick_numa(struct rq rq, struct task_struct curr)
				2594	{
				2595	}
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2596
				2597	static inline void account_numa_enqueue(struct rq rq, struct task_struct p)
				2598	{
				2599	}
				2600
				2601	static inline void account_numa_dequeue(struct rq rq, struct task_struct p)
				2602	{
				2603	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2604	#endif /* CONFIG_NUMA_BALANCING */
				2605
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2606	static void
				2607	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				2608	{
				2609	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	2610	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2611	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	2612	#ifdef CONFIG_SMP
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2613	if (entity_is_task(se)) {
				2614	struct rq *rq = rq_of(cfs_rq);
				2615
				2616	account_numa_enqueue(rq, task_of(se));
				2617	list_add(&se->group_node, &rq->cfs_tasks);
				2618	}
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	2619	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2620	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2621	}
				2622
				2623	static void
				2624	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				2625	{
				2626	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	2627	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2628	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Tim Chen	bfdb198	2016-02-01 14:47:59 -0800	[diff] [blame]	2629	#ifdef CONFIG_SMP
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2630	if (entity_is_task(se)) {
				2631	account_numa_dequeue(rq_of(cfs_rq), task_of(se));
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	2632	list_del_init(&se->group_node);
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2633	}
Tim Chen	bfdb198	2016-02-01 14:47:59 -0800	[diff] [blame]	2634	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2635	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2636	}
				2637
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2638	#ifdef CONFIG_FAIR_GROUP_SCHED
				2639	# ifdef CONFIG_SMP
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2640	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2641	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2642	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2643
Peter Zijlstra	ea1dc6f	2016-06-24 16:11:02 +0200	[diff] [blame]	2644	/*
				2645	* This really should be: cfs_rq->avg.load_avg, but instead we use
				2646	* cfs_rq->load.weight, which is its upper bound. This helps ramp up
				2647	* the shares for small weight interactive tasks.
				2648	*/
				2649	load = scale_load_down(cfs_rq->load.weight);
				2650
				2651	tg_weight = atomic_long_read(&tg->load_avg);
				2652
				2653	/* Ensure tg_weight >= load */
				2654	tg_weight -= cfs_rq->tg_load_avg_contrib;
				2655	tg_weight += load;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2656
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2657	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2658	if (tg_weight)
				2659	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2660
Dietmar Eggemann	b8fd842	2017-01-11 11:29:47 +0000	[diff] [blame]	2661	/*
				2662	* MIN_SHARES has to be unscaled here to support per-CPU partitioning
				2663	* of a group with small tg->shares value. It is a floor value which is
				2664	* assigned as a minimum load.weight to the sched_entity representing
				2665	* the group on a CPU.
				2666	*
				2667	* E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024
				2668	* on an 8-core system with 8 tasks each runnable on one CPU shares has
				2669	* to be 1510241/8=1920 instead of scale_load(MIN_SHARES)=2*1024. In
				2670	* case no task is runnable on a CPU MIN_SHARES=2 should be returned
				2671	* instead of 0.
				2672	*/
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2673	if (shares < MIN_SHARES)
				2674	shares = MIN_SHARES;
				2675	if (shares > tg->shares)
				2676	shares = tg->shares;
				2677
				2678	return shares;
				2679	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2680	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2681	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2682	{
				2683	return tg->shares;
				2684	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2685	# endif /* CONFIG_SMP */
Peter Zijlstra	ea1dc6f	2016-06-24 16:11:02 +0200	[diff] [blame]	2686
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2687	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				2688	unsigned long weight)
				2689	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	2690	if (se->on_rq) {
				2691	/* commit outstanding execution time */
				2692	if (cfs_rq->curr == se)
				2693	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2694	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	2695	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2696
				2697	update_load_set(&se->load, weight);
				2698
				2699	if (se->on_rq)
				2700	account_entity_enqueue(cfs_rq, se);
				2701	}
				2702
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2703	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				2704
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	2705	static void update_cfs_shares(struct sched_entity *se)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2706	{
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	2707	struct cfs_rq *cfs_rq = group_cfs_rq(se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2708	struct task_group *tg;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2709	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2710
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	2711	if (!cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2712	return;
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	2713
				2714	if (throttled_hierarchy(cfs_rq))
				2715	return;
				2716
				2717	tg = cfs_rq->tg;
				2718
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2719	#ifndef CONFIG_SMP
				2720	if (likely(se->load.weight == tg->shares))
				2721	return;
				2722	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2723	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2724
				2725	reweight_entity(cfs_rq_of(se), se, shares);
				2726	}
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	2727
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2728	#else /* CONFIG_FAIR_GROUP_SCHED */
/* Without CONFIG_FAIR_GROUP_SCHED there are no group shares to update. */
static inline void update_cfs_shares(struct sched_entity *se)
{
}
				2732	#endif /* CONFIG_FAIR_GROUP_SCHED */
				2733
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	2734	#ifdef CONFIG_SMP
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2735	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				2736	static const u32 runnable_avg_yN_inv[] = {
				2737	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				2738	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				2739	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				2740	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				2741	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				2742	0x85aac367, 0x82cd8698,
				2743	};
				2744
				2745	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2746	* Approximate:
				2747	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				2748	*/
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2749	static u64 decay_load(u64 val, u64 n)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2750	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2751	unsigned int local_n;
				2752
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2753	if (unlikely(n > LOAD_AVG_PERIOD * 63))
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2754	return 0;
				2755
				2756	/* after bounds checking we can collapse to 32-bit */
				2757	local_n = n;
				2758
				2759	/*
				2760	* As y^PERIOD = 1/2, we can combine
Zhihui Zhang	9c58c79	2014-09-20 21:24:36 -0400	[diff] [blame]	2761	* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
				2762	* With a look-up table which covers y^n (n<PERIOD)
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2763	*
				2764	* To achieve constant time decay_load.
				2765	*/
				2766	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				2767	val >>= local_n / LOAD_AVG_PERIOD;
				2768	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2769	}
				2770
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2771	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
				2772	return val;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2773	}
				2774
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2775	static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2776	{
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2777	u32 c1, c2, c3 = d3; /* y^0 == 1 */
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2778
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2779	/*
				2780	* c1 = d1 y^(p+1)
				2781	*/
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2782	c1 = decay_load((u64)d1, periods);
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2783
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2784	/*
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2785	* p
				2786	* c2 = 1024 \Sum y^n
				2787	* n=1
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2788	*
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2789	* inf inf
				2790	* = 1024 ( \Sum y^n - \Sum y^n - y^0 )
				2791	* n=0 n=p+1
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2792	*/
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2793	c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2794
				2795	return c1 + c2 + c3;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2796	}
				2797
/* Scale @v by capacity factor @s, expressed in 1 << SCHED_CAPACITY_SHIFT units. */
#define cap_scale(v, s)		(((v) * (s)) >> SCHED_CAPACITY_SHIFT)
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2799
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2800	/*
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2801	* Accumulate the three separate parts of the sum; d1 the remainder
				2802	* of the last (incomplete) period, d2 the span of full periods and d3
				2803	* the remainder of the (incomplete) current period.
				2804	*
				2805	* d1 d2 d3
				2806	* ^ ^ ^
				2807	* \| \| \|
				2808	* \|<->\|<----------------->\|<--->\|
				2809	* ... \|---x---\|------\| ... \|------\|-----x (now)
				2810	*
				2811	* p
				2812	* u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
				2813	* n=1
				2814	*
				2815	* = u y^(p+1) + (Step 1)
				2816	*
				2817	* p
				2818	* d1 y^(p+1) + 1024 \Sum y^n + d3 y^0 (Step 2)
				2819	* n=1
				2820	*/
				2821	static __always_inline u32
				2822	accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
				2823	unsigned long weight, int running, struct cfs_rq *cfs_rq)
				2824	{
				2825	unsigned long scale_freq, scale_cpu;
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2826	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2827	u64 periods;
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2828
				2829	scale_freq = arch_scale_freq_capacity(NULL, cpu);
				2830	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
				2831
				2832	delta += sa->period_contrib;
				2833	periods = delta / 1024; /* A period is 1024us (~1ms) */
				2834
				2835	/*
				2836	* Step 1: decay old *_sum if we crossed period boundaries.
				2837	*/
				2838	if (periods) {
				2839	sa->load_sum = decay_load(sa->load_sum, periods);
				2840	if (cfs_rq) {
				2841	cfs_rq->runnable_load_sum =
				2842	decay_load(cfs_rq->runnable_load_sum, periods);
				2843	}
				2844	sa->util_sum = decay_load((u64)(sa->util_sum), periods);
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2845
Peter Zijlstra	05296e7	2017-03-31 10:51:41 +0200	[diff] [blame^]	2846	/*
				2847	* Step 2
				2848	*/
				2849	delta %= 1024;
				2850	contrib = __accumulate_pelt_segments(periods,
				2851	1024 - sa->period_contrib, delta);
				2852	}
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2853	sa->period_contrib = delta;
				2854
				2855	contrib = cap_scale(contrib, scale_freq);
				2856	if (weight) {
				2857	sa->load_sum += weight * contrib;
				2858	if (cfs_rq)
				2859	cfs_rq->runnable_load_sum += weight * contrib;
				2860	}
				2861	if (running)
				2862	sa->util_sum += contrib * scale_cpu;
				2863
				2864	return periods;
				2865	}
				2866
				2867	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2868	* We can represent the historical contribution to runnable average as the
				2869	* coefficients of a geometric series. To do this we sub-divide our runnable
				2870	* history into segments of approximately 1ms (1024us); label the segment that
				2871	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				2872	*
				2873	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
				2874	* p0 p1 p2
				2875	* (now) (~1ms ago) (~2ms ago)
				2876	*
				2877	* Let u_i denote the fraction of p_i that the entity was runnable.
				2878	*
				2879	* We then designate the fractions u_i as our co-efficients, yielding the
				2880	* following representation of historical load:
				2881	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
				2882	*
				2883	* We choose y based on the with of a reasonably scheduling period, fixing:
				2884	* y^32 = 0.5
				2885	*
				2886	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				2887	* approximately half as much as the contribution to load within the last ms
				2888	* (u_0).
				2889	*
				2890	* When a period "rolls over" and we have new u_0`, multiplying the previous
				2891	* sum again by y is sufficient to update:
				2892	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
				2893	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
				2894	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2895	static __always_inline int
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	2896	___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2897	unsigned long weight, int running, struct cfs_rq *cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2898	{
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2899	u64 delta;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2900
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2901	delta = now - sa->last_update_time;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2902	/*
				2903	* This should only happen when time goes backwards, which it
				2904	* unfortunately does during sched clock init when we swap over to TSC.
				2905	*/
				2906	if ((s64)delta < 0) {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2907	sa->last_update_time = now;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2908	return 0;
				2909	}
				2910
				2911	/*
				2912	* Use 1024ns as the unit of measurement since it's a reasonable
				2913	* approximation of 1us and fast to compute.
				2914	*/
				2915	delta >>= 10;
				2916	if (!delta)
				2917	return 0;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2918	sa->last_update_time = now;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2919
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2920	/*
				2921	* Now we know we crossed measurement unit boundaries. The *_avg
				2922	* accrues by two steps:
				2923	*
				2924	* Step 1: accumulate *_sum since last_update_time. If we haven't
				2925	* crossed period boundaries, finish.
				2926	*/
				2927	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
				2928	return 0;
Dietmar Eggemann	6f2b045	2015-09-07 14:57:22 +0100	[diff] [blame]	2929
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2930	/*
				2931	* Step 2: update *_avg.
				2932	*/
				2933	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
				2934	if (cfs_rq) {
				2935	cfs_rq->runnable_load_avg =
				2936	div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2937	}
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2938	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2939
Yuyang Du	a481db3	2017-02-13 05:44:23 +0800	[diff] [blame]	2940	return 1;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2941	}
				2942
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	2943	static int
				2944	__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
				2945	{
				2946	return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
				2947	}
				2948
				2949	static int
				2950	__update_load_avg_se(u64 now, int cpu, struct cfs_rq cfs_rq, struct sched_entity se)
				2951	{
				2952	return ___update_load_avg(now, cpu, &se->avg,
				2953	se->on_rq * scale_load_down(se->load.weight),
				2954	cfs_rq->curr == se, NULL);
				2955	}
				2956
				2957	static int
				2958	__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
				2959	{
				2960	return ___update_load_avg(now, cpu, &cfs_rq->avg,
				2961	scale_load_down(cfs_rq->load.weight),
				2962	cfs_rq->curr != NULL, cfs_rq);
				2963	}
				2964
/*
 * Signed add and clamp on underflow.
 *
 * Adds the signed @_val to the value @_ptr points at; if a negative @_val
 * makes the result wrap around (res > var), the result is clamped to 0.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define add_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(_val) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
								\
	res = var + val;					\
								\
	if (val < 0 && res > var)				\
		res = 0;					\
								\
	WRITE_ONCE(*ptr, res);					\
} while (0)
				2984
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	2985	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	7c3edd2	2016-07-13 10:56:25 +0200	[diff] [blame]	2986	/**
				2987	* update_tg_load_avg - update the tg's load avg
				2988	* @cfs_rq: the cfs_rq whose avg changed
				2989	* @force: update regardless of how small the difference
				2990	*
				2991	* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
				2992	* However, because tg->load_avg is a global value there are performance
				2993	* considerations.
				2994	*
				2995	* In order to avoid having to look at the other cfs_rq's, we use a
				2996	* differential update where we store the last value we propagated. This in
				2997	* turn allows skipping updates if the differential is 'small'.
				2998	*
				2999	* Updating tg's load_avg is necessary before update_cfs_share() (which is
				3000	* done) and effective_load() (which is not done because it is too costly).
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	3001	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3002	static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	3003	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3004	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	3005
Waiman Long	aa0b7ae	2015-12-02 13:41:50 -0500	[diff] [blame]	3006	/*
				3007	* No need to update load_avg for root_task_group as it is not used.
				3008	*/
				3009	if (cfs_rq->tg == &root_task_group)
				3010	return;
				3011
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3012	if (force \|\| abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
				3013	atomic_long_add(delta, &cfs_rq->tg->load_avg);
				3014	cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	3015	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	3016	}
Dietmar Eggemann	f5f9739	2014-02-26 11:19:33 +0000	[diff] [blame]	3017
Byungchul Park	ad936d8	2015-10-24 01:16:19 +0900	[diff] [blame]	3018	/*
				3019	* Called within set_task_rq() right before setting a task's cpu. The
				3020	* caller only guarantees p->pi_lock is held; no other assumptions,
				3021	* including the state of rq->lock, should be made.
				3022	*/
				3023	void set_task_rq_fair(struct sched_entity *se,
				3024	struct cfs_rq prev, struct cfs_rq next)
				3025	{
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	3026	u64 p_last_update_time;
				3027	u64 n_last_update_time;
				3028
Byungchul Park	ad936d8	2015-10-24 01:16:19 +0900	[diff] [blame]	3029	if (!sched_feat(ATTACH_AGE_LOAD))
				3030	return;
				3031
				3032	/*
				3033	* We are supposed to update the task to "current" time, then its up to
				3034	* date and ready to go to new CPU/cfs_rq. But we have difficulty in
				3035	* getting what current time is, so simply throw away the out-of-date
				3036	* time. This will result in the wakee task is less decayed, but giving
				3037	* the wakee more load sounds not bad.
				3038	*/
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	3039	if (!(se->avg.last_update_time && prev))
				3040	return;
Byungchul Park	ad936d8	2015-10-24 01:16:19 +0900	[diff] [blame]	3041
				3042	#ifndef CONFIG_64BIT
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	3043	{
Byungchul Park	ad936d8	2015-10-24 01:16:19 +0900	[diff] [blame]	3044	u64 p_last_update_time_copy;
				3045	u64 n_last_update_time_copy;
				3046
				3047	do {
				3048	p_last_update_time_copy = prev->load_last_update_time_copy;
				3049	n_last_update_time_copy = next->load_last_update_time_copy;
				3050
				3051	smp_rmb();
				3052
				3053	p_last_update_time = prev->avg.last_update_time;
				3054	n_last_update_time = next->avg.last_update_time;
				3055
				3056	} while (p_last_update_time != p_last_update_time_copy \|\|
				3057	n_last_update_time != n_last_update_time_copy);
Byungchul Park	ad936d8	2015-10-24 01:16:19 +0900	[diff] [blame]	3058	}
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	3059	#else
				3060	p_last_update_time = prev->avg.last_update_time;
				3061	n_last_update_time = next->avg.last_update_time;
				3062	#endif
				3063	__update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se);
				3064	se->avg.last_update_time = n_last_update_time;
Byungchul Park	ad936d8	2015-10-24 01:16:19 +0900	[diff] [blame]	3065	}
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3066
				3067	/* Take into account change of utilization of a child task group */
				3068	static inline void
				3069	update_tg_cfs_util(struct cfs_rq cfs_rq, struct sched_entity se)
				3070	{
				3071	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
				3072	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
				3073
				3074	/* Nothing to update */
				3075	if (!delta)
				3076	return;
				3077
				3078	/* Set new sched_entity's utilization */
				3079	se->avg.util_avg = gcfs_rq->avg.util_avg;
				3080	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
				3081
				3082	/* Update parent cfs_rq utilization */
				3083	add_positive(&cfs_rq->avg.util_avg, delta);
				3084	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
				3085	}
				3086
				3087	/* Take into account change of load of a child task group */
				3088	static inline void
				3089	update_tg_cfs_load(struct cfs_rq cfs_rq, struct sched_entity se)
				3090	{
				3091	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
				3092	long delta, load = gcfs_rq->avg.load_avg;
				3093
				3094	/*
				3095	* If the load of group cfs_rq is null, the load of the
				3096	* sched_entity will also be null so we can skip the formula
				3097	*/
				3098	if (load) {
				3099	long tg_load;
				3100
				3101	/* Get tg's load and ensure tg_load > 0 */
				3102	tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
				3103
				3104	/* Ensure tg_load >= load and updated with current load*/
				3105	tg_load -= gcfs_rq->tg_load_avg_contrib;
				3106	tg_load += load;
				3107
				3108	/*
				3109	* We need to compute a correction term in the case that the
				3110	* task group is consuming more CPU than a task of equal
				3111	* weight. A task with a weight equals to tg->shares will have
				3112	* a load less or equal to scale_load_down(tg->shares).
				3113	* Similarly, the sched_entities that represent the task group
				3114	* at parent level, can't have a load higher than
				3115	* scale_load_down(tg->shares). And the Sum of sched_entities'
				3116	* load must be <= scale_load_down(tg->shares).
				3117	*/
				3118	if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
				3119	/* scale gcfs_rq's load into tg's shares*/
				3120	load *= scale_load_down(gcfs_rq->tg->shares);
				3121	load /= tg_load;
				3122	}
				3123	}
				3124
				3125	delta = load - se->avg.load_avg;
				3126
				3127	/* Nothing to update */
				3128	if (!delta)
				3129	return;
				3130
				3131	/* Set new sched_entity's load */
				3132	se->avg.load_avg = load;
				3133	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
				3134
				3135	/* Update parent cfs_rq load */
				3136	add_positive(&cfs_rq->avg.load_avg, delta);
				3137	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
				3138
				3139	/*
				3140	* If the sched_entity is already enqueued, we also have to update the
				3141	* runnable load avg.
				3142	*/
				3143	if (se->on_rq) {
				3144	/* Update parent cfs_rq runnable_load_avg */
				3145	add_positive(&cfs_rq->runnable_load_avg, delta);
				3146	cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
				3147	}
				3148	}
				3149
				3150	static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
				3151	{
				3152	cfs_rq->propagate_avg = 1;
				3153	}
				3154
				3155	static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
				3156	{
				3157	struct cfs_rq *cfs_rq = group_cfs_rq(se);
				3158
				3159	if (!cfs_rq->propagate_avg)
				3160	return 0;
				3161
				3162	cfs_rq->propagate_avg = 0;
				3163	return 1;
				3164	}
				3165
				3166	/* Update task and its cfs_rq load average */
				3167	static inline int propagate_entity_load_avg(struct sched_entity *se)
				3168	{
				3169	struct cfs_rq *cfs_rq;
				3170
				3171	if (entity_is_task(se))
				3172	return 0;
				3173
				3174	if (!test_and_clear_tg_cfs_propagate(se))
				3175	return 0;
				3176
				3177	cfs_rq = cfs_rq_of(se);
				3178
				3179	set_tg_cfs_propagate(cfs_rq);
				3180
				3181	update_tg_cfs_util(cfs_rq, se);
				3182	update_tg_cfs_load(cfs_rq, se);
				3183
				3184	return 1;
				3185	}
				3186
Vincent Guittot	bc42789	2017-03-17 14:47:22 +0100	[diff] [blame]	3187	/*
				3188	* Check if we need to update the load and the utilization of a blocked
				3189	* group_entity:
				3190	*/
				3191	static inline bool skip_blocked_update(struct sched_entity *se)
				3192	{
				3193	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
				3194
				3195	/*
				3196	* If sched_entity still have not zero load or utilization, we have to
				3197	* decay it:
				3198	*/
				3199	if (se->avg.load_avg \|\| se->avg.util_avg)
				3200	return false;
				3201
				3202	/*
				3203	* If there is a pending propagation, we have to update the load and
				3204	* the utilization of the sched_entity:
				3205	*/
				3206	if (gcfs_rq->propagate_avg)
				3207	return false;
				3208
				3209	/*
				3210	* Otherwise, the load and the utilization of the sched_entity is
				3211	* already zero and there is no pending propagation, so it will be a
				3212	* waste of time to try to decay it:
				3213	*/
				3214	return true;
				3215	}
				3216
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	3217	#else /* CONFIG_FAIR_GROUP_SCHED */
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3218
/* Without CONFIG_FAIR_GROUP_SCHED there is no tg load to maintain: no-op. */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3220
				3221	static inline int propagate_entity_load_avg(struct sched_entity *se)
				3222	{
				3223	return 0;
				3224	}
				3225
				3226	static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
				3227
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	3228	#endif /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	3229
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3230	static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3231	{
Rafael J. Wysocki	58919e8	2016-08-16 22:14:55 +0200	[diff] [blame]	3232	if (&this_rq()->cfs == cfs_rq) {
Steve Muckle	21e96f8	2016-03-21 17:21:07 -0700	[diff] [blame]	3233	/*
				3234	* There are a few boundary cases this might miss but it should
				3235	* get called often enough that that should (hopefully) not be
				3236	* a real problem -- added to that it only calls on the local
				3237	* CPU, so if we enqueue remotely we'll miss an update, but
				3238	* the next tick/schedule should update.
				3239	*
				3240	* It will not get called when we go idle, because the idle
				3241	* thread is a different class (!fair), nor will the utilization
				3242	* number include things like RT tasks.
				3243	*
				3244	* As is, the util number is not freq-invariant (we'd have to
				3245	* implement arch_scale_freq_capacity() for that).
				3246	*
				3247	* See cpu_util().
				3248	*/
Rafael J. Wysocki	12bde33	2016-08-10 03:11:17 +0200	[diff] [blame]	3249	cpufreq_update_util(rq_of(cfs_rq), 0);
Steve Muckle	21e96f8	2016-03-21 17:21:07 -0700	[diff] [blame]	3250	}
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3251	}
				3252
/*
 * Unsigned subtract and clamp on underflow.
 *
 * @_ptr: pointer to the value to update
 * @_val: amount to subtract, converted to the type of *@_ptr
 *
 * When the subtraction wraps around above the old value, the result is
 * clamped to 0.
 *
 * Explicitly do a load-store to ensure the intermediate value never hits
 * memory. This allows lockless observations without ever seeing the negative
 * values.
 */
#define sub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(*ptr) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
	res = var - val;					\
	if (res > var)						\
		res = 0;					\
	WRITE_ONCE(*ptr, res);					\
} while (0)
				3269
Peter Zijlstra	3d30544f	2016-06-21 14:27:50 +0200	[diff] [blame]	3270	/**
				3271	* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
				3272	* @now: current time, as per cfs_rq_clock_task()
				3273	* @cfs_rq: cfs_rq to update
				3274	* @update_freq: should we call cfs_rq_util_change() or will the call do so
				3275	*
				3276	* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
				3277	* avg. The immediate corollary is that all (fair) tasks must be attached, see
				3278	* post_init_entity_util_avg().
				3279	*
				3280	* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
				3281	*
Peter Zijlstra	7c3edd2	2016-07-13 10:56:25 +0200	[diff] [blame]	3282	* Returns true if the load decayed or we removed load.
				3283	*
				3284	* Since both these conditions indicate a changed cfs_rq->avg.load we should
				3285	* call update_tg_load_avg() when this function returns true.
Peter Zijlstra	3d30544f	2016-06-21 14:27:50 +0200	[diff] [blame]	3286	*/
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3287	static inline int
				3288	update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
				3289	{
				3290	struct sched_avg *sa = &cfs_rq->avg;
				3291	int decayed, removed_load = 0, removed_util = 0;
				3292
				3293	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
				3294	s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
Peter Zijlstra	8974189	2016-06-16 10:50:40 +0200	[diff] [blame]	3295	sub_positive(&sa->load_avg, r);
				3296	sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3297	removed_load = 1;
Vincent Guittot	4e51607	2016-11-08 10:53:46 +0100	[diff] [blame]	3298	set_tg_cfs_propagate(cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3299	}
				3300
				3301	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
				3302	long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
Peter Zijlstra	8974189	2016-06-16 10:50:40 +0200	[diff] [blame]	3303	sub_positive(&sa->util_avg, r);
				3304	sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3305	removed_util = 1;
Vincent Guittot	4e51607	2016-11-08 10:53:46 +0100	[diff] [blame]	3306	set_tg_cfs_propagate(cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3307	}
				3308
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	3309	decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3310
				3311	#ifndef CONFIG_64BIT
				3312	smp_wmb();
				3313	cfs_rq->load_last_update_time_copy = sa->last_update_time;
				3314	#endif
				3315
				3316	if (update_freq && (decayed \|\| removed_util))
				3317	cfs_rq_util_change(cfs_rq);
Steve Muckle	21e96f8	2016-03-21 17:21:07 -0700	[diff] [blame]	3318
Steve Muckle	41e0d37	2016-03-21 17:21:08 -0700	[diff] [blame]	3319	return decayed \|\| removed_load;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3320	}
				3321
/*
 * Optional action to be done while updating the load average, passed as the
 * flags argument of update_load_avg():
 *
 * UPDATE_TG     - also call update_tg_load_avg() when something changed
 * SKIP_AGE_LOAD - do not age the entity's own load average
 */
#define UPDATE_TG	0x1
#define SKIP_AGE_LOAD	0x2
				3327
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3328	/* Update task and its cfs_rq load average */
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3329	static inline void update_load_avg(struct sched_entity *se, int flags)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3330	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	3331	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3332	u64 now = cfs_rq_clock_task(cfs_rq);
Rafael J. Wysocki	34e2c55	2016-02-15 20:20:42 +0100	[diff] [blame]	3333	struct rq *rq = rq_of(cfs_rq);
				3334	int cpu = cpu_of(rq);
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3335	int decayed;
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	3336
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3337	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3338	* Track task load average for carrying it to new CPU after migrated, and
				3339	* track group sched_entity load average for task_h_load calc in migration
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3340	*/
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	3341	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
				3342	__update_load_avg_se(now, cpu, cfs_rq, se);
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3343
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3344	decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
				3345	decayed \|= propagate_entity_load_avg(se);
				3346
				3347	if (decayed && (flags & UPDATE_TG))
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3348	update_tg_load_avg(cfs_rq, 0);
				3349	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	3350
Peter Zijlstra	3d30544f	2016-06-21 14:27:50 +0200	[diff] [blame]	3351	/**
				3352	* attach_entity_load_avg - attach this entity to its cfs_rq load avg
				3353	* @cfs_rq: cfs_rq to attach to
				3354	* @se: sched_entity to attach
				3355	*
				3356	* Must call update_cfs_rq_load_avg() before this, since we rely on
				3357	* cfs_rq->avg.last_update_time being current.
				3358	*/
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3359	static void attach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3360	{
				3361	se->avg.last_update_time = cfs_rq->avg.last_update_time;
				3362	cfs_rq->avg.load_avg += se->avg.load_avg;
				3363	cfs_rq->avg.load_sum += se->avg.load_sum;
				3364	cfs_rq->avg.util_avg += se->avg.util_avg;
				3365	cfs_rq->avg.util_sum += se->avg.util_sum;
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3366	set_tg_cfs_propagate(cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3367
				3368	cfs_rq_util_change(cfs_rq);
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3369	}
				3370
Peter Zijlstra	3d30544f	2016-06-21 14:27:50 +0200	[diff] [blame]	3371	/**
				3372	* detach_entity_load_avg - detach this entity from its cfs_rq load avg
				3373	* @cfs_rq: cfs_rq to detach from
				3374	* @se: sched_entity to detach
				3375	*
				3376	* Must call update_cfs_rq_load_avg() before this, since we rely on
				3377	* cfs_rq->avg.last_update_time being current.
				3378	*/
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3379	static void detach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3380	{
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3381
Peter Zijlstra	8974189	2016-06-16 10:50:40 +0200	[diff] [blame]	3382	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
				3383	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
				3384	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
				3385	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3386	set_tg_cfs_propagate(cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3387
				3388	cfs_rq_util_change(cfs_rq);
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3389	}
				3390
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3391	/* Add the load generated by se into cfs_rq's load average */
				3392	static inline void
				3393	enqueue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3394	{
				3395	struct sched_avg *sa = &se->avg;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3396
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3397	cfs_rq->runnable_load_avg += sa->load_avg;
				3398	cfs_rq->runnable_load_sum += sa->load_sum;
				3399
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3400	if (!sa->last_update_time) {
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3401	attach_entity_load_avg(cfs_rq, se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3402	update_tg_load_avg(cfs_rq, 0);
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3403	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3404	}
				3405
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3406	/* Remove the runnable load generated by se from cfs_rq's runnable load average */
				3407	static inline void
				3408	dequeue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3409	{
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3410	cfs_rq->runnable_load_avg =
				3411	max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
				3412	cfs_rq->runnable_load_sum =
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3413	max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3414	}
				3415
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	3416	#ifndef CONFIG_64BIT
				3417	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
				3418	{
				3419	u64 last_update_time_copy;
				3420	u64 last_update_time;
				3421
				3422	do {
				3423	last_update_time_copy = cfs_rq->load_last_update_time_copy;
				3424	smp_rmb();
				3425	last_update_time = cfs_rq->avg.last_update_time;
				3426	} while (last_update_time != last_update_time_copy);
				3427
				3428	return last_update_time;
				3429	}
				3430	#else
				3431	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
				3432	{
				3433	return cfs_rq->avg.last_update_time;
				3434	}
				3435	#endif
				3436
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3437	/*
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	3438	* Synchronize entity load avg of dequeued entity without locking
				3439	* the previous rq.
				3440	*/
				3441	void sync_entity_load_avg(struct sched_entity *se)
				3442	{
				3443	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3444	u64 last_update_time;
				3445
				3446	last_update_time = cfs_rq_last_update_time(cfs_rq);
Peter Zijlstra	0ccb977	2017-03-28 11:08:20 +0200	[diff] [blame]	3447	__update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se);
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	3448	}
				3449
				3450	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3451	* Task first catches up with cfs_rq, and then subtract
				3452	* itself from the cfs_rq (task must be off the queue now).
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3453	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3454	void remove_entity_load_avg(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3455	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3456	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3457
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	3458	/*
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	3459	* tasks cannot exit without having gone through wake_up_new_task() ->
				3460	* post_init_entity_util_avg() which will have added things to the
				3461	* cfs_rq, so we can remove unconditionally.
				3462	*
				3463	* Similarly for groups, they will have passed through
				3464	* post_init_entity_util_avg() before unregister_sched_fair_group()
				3465	* calls this.
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	3466	*/
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3467
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	3468	sync_entity_load_avg(se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3469	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
				3470	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	3471	}
Vincent Guittot	642dbc3	2013-04-18 18:34:26 +0200	[diff] [blame]	3472
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	3473	static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
				3474	{
				3475	return cfs_rq->runnable_load_avg;
				3476	}
				3477
				3478	static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
				3479	{
				3480	return cfs_rq->avg.load_avg;
				3481	}
				3482
/* Forward declaration; the CONFIG_SMP implementation is defined later. */
static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	3484
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	3485	#else /* CONFIG_SMP */
				3486
Peter Zijlstra	0101147	2016-06-17 11:20:46 +0200	[diff] [blame]	3487	static inline int
				3488	update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
				3489	{
				3490	return 0;
				3491	}
				3492
/* !CONFIG_SMP: the update_load_avg() flags carry no meaning. */
#define UPDATE_TG	0x0
#define SKIP_AGE_LOAD	0x0
				3495
				3496	static inline void update_load_avg(struct sched_entity *se, int not_used1)
Rafael J. Wysocki	536bd00	2016-05-06 14:58:43 +0200	[diff] [blame]	3497	{
Rafael J. Wysocki	12bde33	2016-08-10 03:11:17 +0200	[diff] [blame]	3498	cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
Rafael J. Wysocki	536bd00	2016-05-06 14:58:43 +0200	[diff] [blame]	3499	}
				3500
/* !CONFIG_SMP: no-op. */
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
/* !CONFIG_SMP: no-op. */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
/* !CONFIG_SMP: no-op. */
static inline void remove_entity_load_avg(struct sched_entity *se) {}
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	3506
/* !CONFIG_SMP: no-op. */
static inline void
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
				3509	static inline void
				3510	detach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se) {}
				3511
/* !CONFIG_SMP: there is nothing to balance; always returns 0. */
static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
{
	return 0;
}
				3516
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	3517	#endif /* CONFIG_SMP */
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3518
/*
 * Debug aid: count entities whose vruntime is further than three scheduling
 * latencies away from @cfs_rq's min_vruntime (in either direction).
 * Compiles to nothing without CONFIG_SCHED_DEBUG.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
				3531
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3532	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3533	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				3534	{
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	3535	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	3536
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	3537	/*
				3538	* The 'current' period is already promised to the current tasks,
				3539	* however the extra weight of the new task will slow them down a
				3540	* little, place the new task so that it fits in the slot that
				3541	* stays open at the end.
				3542	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	3543	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	3544	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3545
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3546	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	3547	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3548	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	3549
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3550	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3551	* Halve their sleep time's effect, to allow
				3552	* for a gentler effect of sleepers:
				3553	*/
				3554	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				3555	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	3556
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3557	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3558	}
				3559
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	3560	/* ensure we never gain time by being placed backwards. */
Viresh Kumar	16c8f1c	2012-11-08 13:33:46 +0530	[diff] [blame]	3561	se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3562	}
				3563
/* Forward declaration; used by the enqueue path below. */
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				3565
/*
 * Print a one-time hint when a schedstat-dependent tracepoint is enabled
 * while schedstats themselves are disabled. Compiles to nothing without
 * CONFIG_SCHEDSTATS.
 */
static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
	if (schedstat_enabled())
		return;

	/* Force schedstat enabled if a dependent tracepoint is active */
	if (trace_sched_stat_wait_enabled()    ||
			trace_sched_stat_sleep_enabled()   ||
			trace_sched_stat_iowait_enabled()  ||
			trace_sched_stat_blocked_enabled() ||
			trace_sched_stat_runtime_enabled())  {
		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
			     "stat_blocked and stat_runtime require the "
			     "kernel parameter schedstats=enabled or "
			     "kernel.sched_schedstats=1\n");
	}
#endif
}
				3585
Peter Zijlstra	b5179ac	2016-05-11 16:10:34 +0200	[diff] [blame]	3586
/*
 * MIGRATION
 *
 *	dequeue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way the vruntime transition between RQs is done when both
 * min_vruntime are up-to-date.
 *
 * WAKEUP (remote)
 *
 *	->migrate_task_rq_fair() (p->state == TASK_WAKING)
 *	  vruntime -= min_vruntime
 *
 *	enqueue
 *	  update_curr()
 *	    update_min_vruntime()
 *	  vruntime += min_vruntime
 *
 * this way we don't have the most up-to-date min_vruntime on the originating
 * CPU and an up-to-date min_vruntime on the destination CPU.
 */
				3616
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3617	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3618	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3619	{
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3620	bool renorm = !(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_MIGRATED);
				3621	bool curr = cfs_rq->curr == se;
Peter Zijlstra	3a47d51	2016-03-09 13:04:03 +0100	[diff] [blame]	3622
Ingo Molnar	53d3bc7	2016-05-11 08:25:53 +0200	[diff] [blame]	3623	/*
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3624	* If we're the current task, we must renormalise before calling
				3625	* update_curr().
Ingo Molnar	53d3bc7	2016-05-11 08:25:53 +0200	[diff] [blame]	3626	*/
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3627	if (renorm && curr)
				3628	se->vruntime += cfs_rq->min_vruntime;
				3629
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	3630	update_curr(cfs_rq);
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3631
				3632	/*
				3633	* Otherwise, renormalise after, such that we're placed at the current
				3634	* moment in time, instead of some random moment in the past. Being
				3635	* placed in the past could significantly boost this task to the
				3636	* fairness detriment of existing tasks.
				3637	*/
				3638	if (renorm && !curr)
				3639	se->vruntime += cfs_rq->min_vruntime;
				3640
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	3641	/*
				3642	* When enqueuing a sched_entity, we must:
				3643	* - Update loads to have both entity and cfs_rq synced with now.
				3644	* - Add its load to cfs_rq->runnable_avg
				3645	* - For group_entity, update its weight to reflect the new share of
				3646	* its group cfs_rq
				3647	* - Add its new weight to cfs_rq->load.weight
				3648	*/
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3649	update_load_avg(se, UPDATE_TG);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3650	enqueue_entity_load_avg(cfs_rq, se);
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	3651	update_cfs_shares(se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3652	account_entity_enqueue(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3653
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	3654	if (flags & ENQUEUE_WAKEUP)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3655	place_entity(cfs_rq, se, 0);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3656
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	3657	check_schedstat_required();
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3658	update_stats_enqueue(cfs_rq, se, flags);
				3659	check_spread(cfs_rq, se);
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3660	if (!curr)
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3661	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3662	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	3663
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3664	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	3665	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3666	check_enqueue_throttle(cfs_rq);
				3667	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3668	}
				3669
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3670	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3671	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3672	for_each_sched_entity(se) {
				3673	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3674	if (cfs_rq->last != se)
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3675	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3676
				3677	cfs_rq->last = NULL;
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3678	}
				3679	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3680
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3681	static void __clear_buddies_next(struct sched_entity *se)
				3682	{
				3683	for_each_sched_entity(se) {
				3684	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3685	if (cfs_rq->next != se)
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3686	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3687
				3688	cfs_rq->next = NULL;
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3689	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3690	}
				3691
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3692	static void __clear_buddies_skip(struct sched_entity *se)
				3693	{
				3694	for_each_sched_entity(se) {
				3695	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3696	if (cfs_rq->skip != se)
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3697	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3698
				3699	cfs_rq->skip = NULL;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3700	}
				3701	}
				3702
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	3703	static void clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
				3704	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3705	if (cfs_rq->last == se)
				3706	__clear_buddies_last(se);
				3707
				3708	if (cfs_rq->next == se)
				3709	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3710
				3711	if (cfs_rq->skip == se)
				3712	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	3713	}
				3714
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	3715	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3716
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3717	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3718	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3719	{
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	3720	/*
				3721	* Update run-time statistics of the 'current'.
				3722	*/
				3723	update_curr(cfs_rq);
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	3724
				3725	/*
				3726	* When dequeuing a sched_entity, we must:
				3727	* - Update loads to have both entity and cfs_rq synced with now.
				3728	* - Substract its load from the cfs_rq->runnable_avg.
				3729	* - Substract its previous weight from cfs_rq->load.weight.
				3730	* - For group entity, update its weight to reflect the new share
				3731	* of its group cfs_rq.
				3732	*/
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3733	update_load_avg(se, UPDATE_TG);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3734	dequeue_entity_load_avg(cfs_rq, se);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	3735
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3736	update_stats_dequeue(cfs_rq, se, flags);
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	3737
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3738	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	3739
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3740	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3741	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3742	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3743	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3744
				3745	/*
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	3746	* Normalize after update_curr(); which will also have moved
				3747	* min_vruntime if @se is the one holding it back. But before doing
				3748	* update_min_vruntime() again, which will discount @se's position and
				3749	* can move min_vruntime forward still more.
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3750	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3751	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3752	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	3753
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3754	/* return excess runtime on last dequeue */
				3755	return_cfs_rq_runtime(cfs_rq);
				3756
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	3757	update_cfs_shares(se);
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	3758
				3759	/*
				3760	* Now advance min_vruntime if @se was the entity holding it back,
				3761	* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
				3762	* put back on, and if we advance min_vruntime, we'll be placed back
				3763	* further than we started -- ie. we'll be penalized.
				3764	*/
				3765	if ((flags & (DEQUEUE_SAVE \| DEQUEUE_MOVE)) == DEQUEUE_SAVE)
				3766	update_min_vruntime(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3767	}
				3768
				3769	/*
				3770	* Preempt the current task with a newly woken task if needed:
				3771	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	3772	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	3773	check_preempt_tick(struct cfs_rq cfs_rq, struct sched_entity curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3774	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3775	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3776	struct sched_entity *se;
				3777	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3778
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	3779	ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3780	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	3781	if (delta_exec > ideal_runtime) {
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3782	resched_curr(rq_of(cfs_rq));
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	3783	/*
				3784	* The current task ran long enough, ensure it doesn't get
				3785	* re-elected due to buddy favours.
				3786	*/
				3787	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3788	return;
				3789	}
				3790
				3791	/*
				3792	* Ensure that a task that missed wakeup preemption by a
				3793	* narrow margin doesn't have to wait for a full slice.
				3794	* This also mitigates buddy induced latencies under load.
				3795	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3796	if (delta_exec < sysctl_sched_min_granularity)
				3797	return;
				3798
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3799	se = __pick_first_entity(cfs_rq);
				3800	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3801
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3802	if (delta < 0)
				3803	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	3804
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3805	if (delta > ideal_runtime)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3806	resched_curr(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3807	}
				3808
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3809	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	3810	set_next_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3811	{
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3812	/* 'current' is not kept within the tree. */
				3813	if (se->on_rq) {
				3814	/*
				3815	* Any task has to be enqueued before it get to execute on
				3816	* a CPU. So account for the time it spent waiting on the
				3817	* runqueue.
				3818	*/
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3819	update_stats_wait_end(cfs_rq, se);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3820	__dequeue_entity(cfs_rq, se);
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3821	update_load_avg(se, UPDATE_TG);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3822	}
				3823
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	3824	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	3825	cfs_rq->curr = se;
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3826
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	3827	/*
				3828	* Track our maximum slice length, if the CPU's load is at
				3829	* least twice that of our own weight (i.e. dont track it
				3830	* when there are only lesser-weight tasks around):
				3831	*/
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	3832	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3833	schedstat_set(se->statistics.slice_max,
				3834	max((u64)schedstat_val(se->statistics.slice_max),
				3835	se->sum_exec_runtime - se->prev_sum_exec_runtime));
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	3836	}
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3837
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	3838	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3839	}
				3840
Peter Zijlstra	3f3a490	2008-10-24 11:06:16 +0200	[diff] [blame]	3841	static int
				3842	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se);
				3843
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3844	/*
				3845	* Pick the next process, keeping these things in mind, in this order:
				3846	* 1) keep things fair between processes/task groups
				3847	* 2) pick the "next" process, since someone really wants that to run
				3848	* 3) pick the "last" process, for cache locality
				3849	* 4) do not run the "skip" process, if something else is available
				3850	*/
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3851	static struct sched_entity *
				3852	pick_next_entity(struct cfs_rq cfs_rq, struct sched_entity curr)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3853	{
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3854	struct sched_entity *left = __pick_first_entity(cfs_rq);
				3855	struct sched_entity *se;
				3856
				3857	/*
				3858	* If curr is set we have to see if its left of the leftmost entity
				3859	* still in the tree, provided there was anything in the tree at all.
				3860	*/
				3861	if (!left \|\| (curr && entity_before(curr, left)))
				3862	left = curr;
				3863
				3864	se = left; /* ideally we run the leftmost entity */
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	3865
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3866	/*
				3867	* Avoid running the skip buddy, if running something else can
				3868	* be done without getting too unfair.
				3869	*/
				3870	if (cfs_rq->skip == se) {
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3871	struct sched_entity *second;
				3872
				3873	if (se == curr) {
				3874	second = __pick_first_entity(cfs_rq);
				3875	} else {
				3876	second = __pick_next_entity(se);
				3877	if (!second \|\| (curr && entity_before(curr, second)))
				3878	second = curr;
				3879	}
				3880
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3881	if (second && wakeup_preempt_entity(second, left) < 1)
				3882	se = second;
				3883	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3884
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3885	/*
				3886	* Prefer last buddy, try to return the CPU to a preempted task.
				3887	*/
				3888	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				3889	se = cfs_rq->last;
				3890
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3891	/*
				3892	* Someone really wants this to run. If it's not unfair, run it.
				3893	*/
				3894	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				3895	se = cfs_rq->next;
				3896
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3897	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	3898
				3899	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3900	}
				3901
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3902	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3903
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	3904	static void put_prev_entity(struct cfs_rq cfs_rq, struct sched_entity prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3905	{
				3906	/*
				3907	* If still on the runqueue then deactivate_task()
				3908	* was not called and update_curr() has to be done:
				3909	*/
				3910	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	3911	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3912
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3913	/* throttle cfs_rqs exceeding runtime */
				3914	check_cfs_rq_runtime(cfs_rq);
				3915
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3916	check_spread(cfs_rq, prev);
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	3917
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3918	if (prev->on_rq) {
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3919	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3920	/* Put 'current' back into the tree. */
				3921	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3922	/* in !on_rq case, update occurred at dequeue */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3923	update_load_avg(prev, 0);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3924	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	3925	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3926	}
				3927
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3928	static void
				3929	entity_tick(struct cfs_rq cfs_rq, struct sched_entity curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3930	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3931	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3932	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3933	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3934	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3935
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	3936	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3937	* Ensure that runnable average is periodically updated.
				3938	*/
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3939	update_load_avg(curr, UPDATE_TG);
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	3940	update_cfs_shares(curr);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3941
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3942	#ifdef CONFIG_SCHED_HRTICK
				3943	/*
				3944	* queued ticks are scheduled to match the slice, so don't bother
				3945	* validating it and just reschedule.
				3946	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	3947	if (queued) {
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3948	resched_curr(rq_of(cfs_rq));
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	3949	return;
				3950	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3951	/*
				3952	* don't let the period tick interfere with the hrtick preemption
				3953	*/
				3954	if (!sched_feat(DOUBLE_TICK) &&
				3955	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				3956	return;
				3957	#endif
				3958
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	3959	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	3960	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3961	}
				3962
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	3963
				3964	/**************************************************
				3965	* CFS bandwidth control machinery
				3966	*/
				3967
				3968	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3969
				3970	#ifdef HAVE_JUMP_LABEL
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	3971	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3972
				3973	static inline bool cfs_bandwidth_used(void)
				3974	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	3975	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3976	}
				3977
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3978	void cfs_bandwidth_usage_inc(void)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3979	{
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3980	static_key_slow_inc(&__cfs_bandwidth_used);
				3981	}
				3982
				3983	void cfs_bandwidth_usage_dec(void)
				3984	{
				3985	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3986	}
				3987	#else /* HAVE_JUMP_LABEL */
				3988	static bool cfs_bandwidth_used(void)
				3989	{
				3990	return true;
				3991	}
				3992
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3993	void cfs_bandwidth_usage_inc(void) {}
				3994	void cfs_bandwidth_usage_dec(void) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3995	#endif /* HAVE_JUMP_LABEL */
				3996
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	3997	/*
				3998	* default period for cfs group bandwidth.
				3999	* default: 0.1s, units: nanoseconds
				4000	*/
				4001	static inline u64 default_cfs_period(void)
				4002	{
				4003	return 100000000ULL;
				4004	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4005
				4006	static inline u64 sched_cfs_bandwidth_slice(void)
				4007	{
				4008	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				4009	}
				4010
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4011	/*
				4012	* Replenish runtime according to assigned quota and update expiration time.
				4013	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				4014	* additional synchronization around rq->lock.
				4015	*
				4016	* requires cfs_b->lock
				4017	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4018	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4019	{
				4020	u64 now;
				4021
				4022	if (cfs_b->quota == RUNTIME_INF)
				4023	return;
				4024
				4025	now = sched_clock_cpu(smp_processor_id());
				4026	cfs_b->runtime = cfs_b->quota;
				4027	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				4028	}
				4029
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4030	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				4031	{
				4032	return &tg->cfs_bandwidth;
				4033	}
				4034
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4035	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				4036	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				4037	{
				4038	if (unlikely(cfs_rq->throttle_count))
Xunlei Pang	1a99ae3	2016-05-10 21:03:18 +0800	[diff] [blame]	4039	return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4040
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4041	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4042	}
				4043
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4044	/* returns 0 on failure to allocate runtime */
				4045	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4046	{
				4047	struct task_group *tg = cfs_rq->tg;
				4048	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4049	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4050
				4051	/* note: this is a positive sum as runtime_remaining <= 0 */
				4052	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				4053
				4054	raw_spin_lock(&cfs_b->lock);
				4055	if (cfs_b->quota == RUNTIME_INF)
				4056	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4057	else {
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4058	start_cfs_bandwidth(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4059
				4060	if (cfs_b->runtime > 0) {
				4061	amount = min(cfs_b->runtime, min_amount);
				4062	cfs_b->runtime -= amount;
				4063	cfs_b->idle = 0;
				4064	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4065	}
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4066	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4067	raw_spin_unlock(&cfs_b->lock);
				4068
				4069	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4070	/*
				4071	* we may have advanced our local expiration to account for allowed
				4072	* spread between our sched_clock and the one on which runtime was
				4073	* issued.
				4074	*/
				4075	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				4076	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4077
				4078	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4079	}
				4080
				4081	/*
				4082	* Note: This depends on the synchronization provided by sched_clock and the
				4083	* fact that rq->clock snapshots this value.
				4084	*/
				4085	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4086	{
				4087	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4088
				4089	/* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4090	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4091	return;
				4092
				4093	if (cfs_rq->runtime_remaining < 0)
				4094	return;
				4095
				4096	/*
				4097	* If the local deadline has passed we have to consider the
				4098	* possibility that our sched_clock is 'fast' and the global deadline
				4099	* has not truly expired.
				4100	*
				4101	* Fortunately we can check determine whether this the case by checking
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4102	* whether the global deadline has advanced. It is valid to compare
				4103	* cfs_b->runtime_expires without any locks since we only care about
				4104	* exact equality, so a partial write will still work.
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4105	*/
				4106
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4107	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4108	/* extend local deadline, drift is bounded above by 2 ticks */
				4109	cfs_rq->runtime_expires += TICK_NSEC;
				4110	} else {
				4111	/* global deadline is ahead, expiration has passed */
				4112	cfs_rq->runtime_remaining = 0;
				4113	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4114	}
				4115
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	4116	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4117	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4118	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4119	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4120	expire_cfs_rq_runtime(cfs_rq);
				4121
				4122	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4123	return;
				4124
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4125	/*
				4126	* if we're unable to extend our runtime we resched so that the active
				4127	* hierarchy can be throttled
				4128	*/
				4129	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	4130	resched_curr(rq_of(cfs_rq));
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4131	}
				4132
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	4133	static __always_inline
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	4134	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4135	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4136	if (!cfs_bandwidth_used() \|\| !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4137	return;
				4138
				4139	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				4140	}
				4141
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4142	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				4143	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4144	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4145	}
				4146
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4147	/* check whether cfs_rq, or any parent, is throttled */
				4148	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				4149	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4150	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4151	}
				4152
				4153	/*
				4154	* Ensure that neither of the group entities corresponding to src_cpu or
				4155	* dest_cpu are members of a throttled hierarchy when performing group
				4156	* load-balance operations.
				4157	*/
				4158	static inline int throttled_lb_pair(struct task_group *tg,
				4159	int src_cpu, int dest_cpu)
				4160	{
				4161	struct cfs_rq src_cfs_rq, dest_cfs_rq;
				4162
				4163	src_cfs_rq = tg->cfs_rq[src_cpu];
				4164	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				4165
				4166	return throttled_hierarchy(src_cfs_rq) \|\|
				4167	throttled_hierarchy(dest_cfs_rq);
				4168	}
				4169
				4170	/* updated child weight may affect parent so we have to do this bottom up */
				4171	static int tg_unthrottle_up(struct task_group tg, void data)
				4172	{
				4173	struct rq *rq = data;
				4174	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				4175
				4176	cfs_rq->throttle_count--;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4177	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4178	/* adjust cfs_rq_clock_task() */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4179	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4180	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4181	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4182
				4183	return 0;
				4184	}
				4185
				4186	static int tg_throttle_down(struct task_group tg, void data)
				4187	{
				4188	struct rq *rq = data;
				4189	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				4190
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4191	/* group is entering throttled state, stop time */
				4192	if (!cfs_rq->throttle_count)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4193	cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4194	cfs_rq->throttle_count++;
				4195
				4196	return 0;
				4197	}
				4198
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4199	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4200	{
				4201	struct rq *rq = rq_of(cfs_rq);
				4202	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4203	struct sched_entity *se;
				4204	long task_delta, dequeue = 1;
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4205	bool empty;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4206
				4207	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				4208
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4209	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4210	rcu_read_lock();
				4211	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				4212	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4213
				4214	task_delta = cfs_rq->h_nr_running;
				4215	for_each_sched_entity(se) {
				4216	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				4217	/* throttled entity or throttle-on-deactivate */
				4218	if (!se->on_rq)
				4219	break;
				4220
				4221	if (dequeue)
				4222	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				4223	qcfs_rq->h_nr_running -= task_delta;
				4224
				4225	if (qcfs_rq->load.weight)
				4226	dequeue = 0;
				4227	}
				4228
				4229	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4230	sub_nr_running(rq, task_delta);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4231
				4232	cfs_rq->throttled = 1;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4233	cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4234	raw_spin_lock(&cfs_b->lock);
Cong Wang	d49db34	2015-06-24 12:41:47 -0700	[diff] [blame]	4235	empty = list_empty(&cfs_b->throttled_cfs_rq);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4236
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4237	/*
				4238	* Add to the _head_ of the list, so that an already-started
				4239	* distribute_cfs_runtime will not see us
				4240	*/
				4241	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4242
				4243	/*
				4244	* If we're the first throttled task, make sure the bandwidth
				4245	* timer is running.
				4246	*/
				4247	if (empty)
				4248	start_cfs_bandwidth(cfs_b);
				4249
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4250	raw_spin_unlock(&cfs_b->lock);
				4251	}
				4252
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4253	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4254	{
				4255	struct rq *rq = rq_of(cfs_rq);
				4256	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4257	struct sched_entity *se;
				4258	int enqueue = 1;
				4259	long task_delta;
				4260
Michael Wang	22b958d	2013-06-04 14:23:39 +0800	[diff] [blame]	4261	se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4262
				4263	cfs_rq->throttled = 0;
Frederic Weisbecker	1a55af2	2013-04-12 01:51:01 +0200	[diff] [blame]	4264
				4265	update_rq_clock(rq);
				4266
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4267	raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4268	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4269	list_del_rcu(&cfs_rq->throttled_list);
				4270	raw_spin_unlock(&cfs_b->lock);
				4271
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4272	/* update hierarchical throttle state */
				4273	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				4274
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4275	if (!cfs_rq->load.weight)
				4276	return;
				4277
				4278	task_delta = cfs_rq->h_nr_running;
				4279	for_each_sched_entity(se) {
				4280	if (se->on_rq)
				4281	enqueue = 0;
				4282
				4283	cfs_rq = cfs_rq_of(se);
				4284	if (enqueue)
				4285	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				4286	cfs_rq->h_nr_running += task_delta;
				4287
				4288	if (cfs_rq_throttled(cfs_rq))
				4289	break;
				4290	}
				4291
				4292	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4293	add_nr_running(rq, task_delta);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4294
				4295	/* determine whether we need to wake up potentially idle cpu */
				4296	if (rq->curr == rq->idle && rq->cfs.nr_running)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	4297	resched_curr(rq);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4298	}
				4299
				4300	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				4301	u64 remaining, u64 expires)
				4302	{
				4303	struct cfs_rq *cfs_rq;
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4304	u64 runtime;
				4305	u64 starting_runtime = remaining;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4306
				4307	rcu_read_lock();
				4308	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				4309	throttled_list) {
				4310	struct rq *rq = rq_of(cfs_rq);
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	4311	struct rq_flags rf;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4312
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	4313	rq_lock(rq, &rf);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4314	if (!cfs_rq_throttled(cfs_rq))
				4315	goto next;
				4316
				4317	runtime = -cfs_rq->runtime_remaining + 1;
				4318	if (runtime > remaining)
				4319	runtime = remaining;
				4320	remaining -= runtime;
				4321
				4322	cfs_rq->runtime_remaining += runtime;
				4323	cfs_rq->runtime_expires = expires;
				4324
				4325	/* we check whether we're throttled above */
				4326	if (cfs_rq->runtime_remaining > 0)
				4327	unthrottle_cfs_rq(cfs_rq);
				4328
				4329	next:
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	4330	rq_unlock(rq, &rf);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4331
				4332	if (!remaining)
				4333	break;
				4334	}
				4335	rcu_read_unlock();
				4336
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4337	return starting_runtime - remaining;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4338	}
				4339
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4340	/*
				4341	* Responsible for refilling a task_group's bandwidth and unthrottling its
				4342	* cfs_rqs as appropriate. If there has been no activity within the last
				4343	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				4344	* used to track this state.
				4345	*/
				4346	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				4347	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4348	u64 runtime, runtime_expires;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4349	int throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4350
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4351	/* no need to continue the timer with no bandwidth constraint */
				4352	if (cfs_b->quota == RUNTIME_INF)
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4353	goto out_deactivate;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4354
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4355	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	4356	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4357
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4358	/*
				4359	* idle depends on !throttled (for the case of a large deficit), and if
				4360	* we're going inactive then everything else can be deferred
				4361	*/
				4362	if (cfs_b->idle && !throttled)
				4363	goto out_deactivate;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4364
				4365	__refill_cfs_bandwidth_runtime(cfs_b);
				4366
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4367	if (!throttled) {
				4368	/* mark as potentially idle for the upcoming period */
				4369	cfs_b->idle = 1;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4370	return 0;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4371	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4372
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	4373	/* account preceding periods in which throttling occurred */
				4374	cfs_b->nr_throttled += overrun;
				4375
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4376	runtime_expires = cfs_b->runtime_expires;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4377
				4378	/*
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4379	* This check is repeated as we are holding onto the new bandwidth while
				4380	* we unthrottle. This can potentially race with an unthrottled group
				4381	* trying to acquire new bandwidth from the global pool. This can result
				4382	* in us over-using our runtime if it is all used during this loop, but
				4383	* only by limited amounts in that extreme case.
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4384	*/
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4385	while (throttled && cfs_b->runtime > 0) {
				4386	runtime = cfs_b->runtime;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4387	raw_spin_unlock(&cfs_b->lock);
				4388	/* we can't nest cfs_b->lock while distributing bandwidth */
				4389	runtime = distribute_cfs_runtime(cfs_b, runtime,
				4390	runtime_expires);
				4391	raw_spin_lock(&cfs_b->lock);
				4392
				4393	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4394
				4395	cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4396	}
				4397
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4398	/*
				4399	* While we are ensured activity in the period following an
				4400	* unthrottle, this also covers the case in which the new bandwidth is
				4401	* insufficient to cover the existing bandwidth deficit. (Forcing the
				4402	* timer to remain active while there are any throttled entities.)
				4403	*/
				4404	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4405
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4406	return 0;
				4407
				4408	out_deactivate:
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4409	return 1;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4410	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4411
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4412	/* a cfs_rq won't donate quota below this amount */
				4413	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				4414	/* minimum remaining period time to redistribute slack quota */
				4415	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				4416	/* how long we wait to gather additional slack before distributing */
				4417	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				4418
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	4419	/*
				4420	* Are we near the end of the current quota period?
				4421	*
				4422	* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
Thomas Gleixner	4961b6e	2015-04-14 21:09:05 +0000	[diff] [blame]	4423	* hrtimer base being cleared by hrtimer_start. In the case of
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	4424	* migrate_hrtimers, base is never cleared, so we are fine.
				4425	*/
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4426	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				4427	{
				4428	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				4429	u64 remaining;
				4430
				4431	/* if the call-back is running a quota refresh is already occurring */
				4432	if (hrtimer_callback_running(refresh_timer))
				4433	return 1;
				4434
				4435	/* is a quota refresh about to occur? */
				4436	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				4437	if (remaining < min_expire)
				4438	return 1;
				4439
				4440	return 0;
				4441	}
				4442
				4443	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				4444	{
				4445	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				4446
				4447	/* if there's a quota refresh soon don't bother with slack */
				4448	if (runtime_refresh_within(cfs_b, min_left))
				4449	return;
				4450
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4451	hrtimer_start(&cfs_b->slack_timer,
				4452	ns_to_ktime(cfs_bandwidth_slack_period),
				4453	HRTIMER_MODE_REL);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4454	}
				4455
				4456	/* we know any runtime found here is valid as update_curr() precedes return */
				4457	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4458	{
				4459	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4460	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				4461
				4462	if (slack_runtime <= 0)
				4463	return;
				4464
				4465	raw_spin_lock(&cfs_b->lock);
				4466	if (cfs_b->quota != RUNTIME_INF &&
				4467	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				4468	cfs_b->runtime += slack_runtime;
				4469
				4470	/* we are under rq->lock, defer unthrottling using a timer */
				4471	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				4472	!list_empty(&cfs_b->throttled_cfs_rq))
				4473	start_cfs_slack_bandwidth(cfs_b);
				4474	}
				4475	raw_spin_unlock(&cfs_b->lock);
				4476
				4477	/* even if it's not valid for return we don't want to try again */
				4478	cfs_rq->runtime_remaining -= slack_runtime;
				4479	}
				4480
				4481	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4482	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4483	if (!cfs_bandwidth_used())
				4484	return;
				4485
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	4486	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4487	return;
				4488
				4489	__return_cfs_rq_runtime(cfs_rq);
				4490	}
				4491
				4492	/*
				4493	* This is done with a timer (instead of inline with bandwidth return) since
				4494	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				4495	*/
				4496	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				4497	{
				4498	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				4499	u64 expires;
				4500
				4501	/* confirm we're still not at a refresh boundary */
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4502	raw_spin_lock(&cfs_b->lock);
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	4503	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
				4504	raw_spin_unlock(&cfs_b->lock);
				4505	return;
				4506	}
				4507
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4508	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4509	runtime = cfs_b->runtime;
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4510
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4511	expires = cfs_b->runtime_expires;
				4512	raw_spin_unlock(&cfs_b->lock);
				4513
				4514	if (!runtime)
				4515	return;
				4516
				4517	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				4518
				4519	raw_spin_lock(&cfs_b->lock);
				4520	if (expires == cfs_b->runtime_expires)
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4521	cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4522	raw_spin_unlock(&cfs_b->lock);
				4523	}
				4524
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4525	/*
				4526	* When a group wakes up we want to make sure that its quota is not already
				4527	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				4528	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				4529	*/
				4530	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				4531	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4532	if (!cfs_bandwidth_used())
				4533	return;
				4534
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4535	/* an active group must be handled by the update_curr()->put() path */
				4536	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				4537	return;
				4538
				4539	/* ensure the group is not already throttled */
				4540	if (cfs_rq_throttled(cfs_rq))
				4541	return;
				4542
				4543	/* update runtime allocation */
				4544	account_cfs_rq_runtime(cfs_rq, 0);
				4545	if (cfs_rq->runtime_remaining <= 0)
				4546	throttle_cfs_rq(cfs_rq);
				4547	}
				4548
Peter Zijlstra	55e16d3	2016-06-22 15:14:26 +0200	[diff] [blame]	4549	static void sync_throttle(struct task_group *tg, int cpu)
				4550	{
				4551	struct cfs_rq pcfs_rq, cfs_rq;
				4552
				4553	if (!cfs_bandwidth_used())
				4554	return;
				4555
				4556	if (!tg->parent)
				4557	return;
				4558
				4559	cfs_rq = tg->cfs_rq[cpu];
				4560	pcfs_rq = tg->parent->cfs_rq[cpu];
				4561
				4562	cfs_rq->throttle_count = pcfs_rq->throttle_count;
Xunlei Pang	b892212	2016-07-09 15:54:22 +0800	[diff] [blame]	4563	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
Peter Zijlstra	55e16d3	2016-06-22 15:14:26 +0200	[diff] [blame]	4564	}
				4565
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4566	/* conditionally throttle active cfs_rq's from put_prev_entity() */
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4567	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4568	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4569	if (!cfs_bandwidth_used())
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4570	return false;
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4571
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4572	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4573	return false;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4574
				4575	/*
				4576	* it's possible for a throttled entity to be forced into a running
				4577	* state (e.g. set_curr_task), in this case we're finished.
				4578	*/
				4579	if (cfs_rq_throttled(cfs_rq))
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4580	return true;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4581
				4582	throttle_cfs_rq(cfs_rq);
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4583	return true;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4584	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4585
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4586	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				4587	{
				4588	struct cfs_bandwidth *cfs_b =
				4589	container_of(timer, struct cfs_bandwidth, slack_timer);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4590
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4591	do_sched_cfs_slack_timer(cfs_b);
				4592
				4593	return HRTIMER_NORESTART;
				4594	}
				4595
				4596	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				4597	{
				4598	struct cfs_bandwidth *cfs_b =
				4599	container_of(timer, struct cfs_bandwidth, period_timer);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4600	int overrun;
				4601	int idle = 0;
				4602
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4603	raw_spin_lock(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4604	for (;;) {
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4605	overrun = hrtimer_forward_now(timer, cfs_b->period);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4606	if (!overrun)
				4607	break;
				4608
				4609	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				4610	}
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4611	if (idle)
				4612	cfs_b->period_active = 0;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4613	raw_spin_unlock(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4614
				4615	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				4616	}
				4617
				4618	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4619	{
				4620	raw_spin_lock_init(&cfs_b->lock);
				4621	cfs_b->runtime = 0;
				4622	cfs_b->quota = RUNTIME_INF;
				4623	cfs_b->period = ns_to_ktime(default_cfs_period());
				4624
				4625	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4626	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4627	cfs_b->period_timer.function = sched_cfs_period_timer;
				4628	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				4629	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				4630	}
				4631
				4632	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4633	{
				4634	cfs_rq->runtime_enabled = 0;
				4635	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				4636	}
				4637
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4638	void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4639	{
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4640	lockdep_assert_held(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4641
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4642	if (!cfs_b->period_active) {
				4643	cfs_b->period_active = 1;
				4644	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
				4645	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
				4646	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4647	}
				4648
				4649	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4650	{
Tetsuo Handa	7f1a169	2014-12-25 15:51:21 +0900	[diff] [blame]	4651	/* init_cfs_bandwidth() was not called */
				4652	if (!cfs_b->throttled_cfs_rq.next)
				4653	return;
				4654
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4655	hrtimer_cancel(&cfs_b->period_timer);
				4656	hrtimer_cancel(&cfs_b->slack_timer);
				4657	}
				4658
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4659	static void __maybe_unused update_runtime_enabled(struct rq *rq)
				4660	{
				4661	struct cfs_rq *cfs_rq;
				4662
				4663	for_each_leaf_cfs_rq(rq, cfs_rq) {
				4664	struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
				4665
				4666	raw_spin_lock(&cfs_b->lock);
				4667	cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
				4668	raw_spin_unlock(&cfs_b->lock);
				4669	}
				4670	}
				4671
Arnd Bergmann	38dc334	2013-01-25 14:14:22 +0000	[diff] [blame]	4672	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4673	{
				4674	struct cfs_rq *cfs_rq;
				4675
				4676	for_each_leaf_cfs_rq(rq, cfs_rq) {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4677	if (!cfs_rq->runtime_enabled)
				4678	continue;
				4679
				4680	/*
				4681	* clock_task is not advancing so we just need to make sure
				4682	* there's some valid quota amount
				4683	*/
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4684	cfs_rq->runtime_remaining = 1;
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4685	/*
				4686	* Offline rq is schedulable till cpu is completely disabled
				4687	* in take_cpu_down(), so we prevent new cfs throttling here.
				4688	*/
				4689	cfs_rq->runtime_enabled = 0;
				4690
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4691	if (cfs_rq_throttled(cfs_rq))
				4692	unthrottle_cfs_rq(cfs_rq);
				4693	}
				4694	}
				4695
				4696	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4697	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				4698	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4699	return rq_clock_task(rq_of(cfs_rq));
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4700	}
				4701
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	4702	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4703	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4704	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	55e16d3	2016-06-22 15:14:26 +0200	[diff] [blame]	4705	static inline void sync_throttle(struct task_group *tg, int cpu) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	4706	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4707
				4708	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				4709	{
				4710	return 0;
				4711	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4712
				4713	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				4714	{
				4715	return 0;
				4716	}
				4717
				4718	static inline int throttled_lb_pair(struct task_group *tg,
				4719	int src_cpu, int dest_cpu)
				4720	{
				4721	return 0;
				4722	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4723
				4724	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				4725
				4726	#ifdef CONFIG_FAIR_GROUP_SCHED
				4727	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	4728	#endif
				4729
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4730	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				4731	{
				4732	return NULL;
				4733	}
				4734	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4735	static inline void update_runtime_enabled(struct rq *rq) {}
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	4736	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4737
				4738	#endif /* CONFIG_CFS_BANDWIDTH */
				4739
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4740	/**************************************************
				4741	* CFS operations on tasks:
				4742	*/
				4743
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4744	#ifdef CONFIG_SCHED_HRTICK
				4745	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				4746	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4747	struct sched_entity *se = &p->se;
				4748	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4749
Peter Zijlstra	9148a3a	2016-09-20 22:34:51 +0200	[diff] [blame]	4750	SCHED_WARN_ON(task_rq(p) != rq);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4751
Srivatsa Vaddagiri	8bf46a3	2016-09-16 18:28:51 -0700	[diff] [blame]	4752	if (rq->cfs.h_nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4753	u64 slice = sched_slice(cfs_rq, se);
				4754	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				4755	s64 delta = slice - ran;
				4756
				4757	if (delta < 0) {
				4758	if (rq->curr == p)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	4759	resched_curr(rq);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4760	return;
				4761	}
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	4762	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4763	}
				4764	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4765
				4766	/*
				4767	* called from enqueue/dequeue and updates the hrtick when the
				4768	* current task is from our class and nr_running is low enough
				4769	* to matter.
				4770	*/
				4771	static void hrtick_update(struct rq *rq)
				4772	{
				4773	struct task_struct *curr = rq->curr;
				4774
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	4775	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4776	return;
				4777
				4778	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				4779	hrtick_start_fair(rq, curr);
				4780	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	4781	#else /* !CONFIG_SCHED_HRTICK */
/* !CONFIG_SCHED_HRTICK: no high-resolution tick to program */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4786
				4787	static inline void hrtick_update(struct rq *rq)
				4788	{
				4789	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4790	#endif
				4791
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4792	/*
				4793	* The enqueue_task method is called before nr_running is
				4794	* increased. Here we update the fair scheduling stats and
				4795	* then put the task into the rbtree:
				4796	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	4797	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4798	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4799	{
				4800	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4801	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4802
Rafael J. Wysocki	8c34ab1	2016-09-09 23:59:33 +0200	[diff] [blame]	4803	/*
				4804	* If in_iowait is set, the code below may not trigger any cpufreq
				4805	* utilization updates, so do it here explicitly with the IOWAIT flag
				4806	* passed.
				4807	*/
				4808	if (p->in_iowait)
				4809	cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
				4810
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4811	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4812	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4813	break;
				4814	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4815	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4816
				4817	/*
				4818	* end evaluation on encountering a throttled cfs_rq
				4819	*
				4820	* note: in the case of encountering a throttled cfs_rq we will
				4821	* post the final h_nr_running increment below.
Peter Zijlstra	e210bff	2016-06-16 18:51:48 +0200	[diff] [blame]	4822	*/
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4823	if (cfs_rq_throttled(cfs_rq))
				4824	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4825	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4826
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4827	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4828	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4829
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4830	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	4831	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4832	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4833
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4834	if (cfs_rq_throttled(cfs_rq))
				4835	break;
				4836
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	4837	update_load_avg(se, UPDATE_TG);
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	4838	update_cfs_shares(se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4839	}
				4840
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4841	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4842	add_nr_running(rq, 1);
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4843
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4844	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4845	}
				4846
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4847	static void set_next_buddy(struct sched_entity *se);
				4848
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4849	/*
				4850	* The dequeue_task method is called before nr_running is
				4851	* decreased. We remove the task from the rbtree and
				4852	* update the fair scheduling stats:
				4853	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4854	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4855	{
				4856	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4857	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4858	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4859
				4860	for_each_sched_entity(se) {
				4861	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4862	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4863
				4864	/*
				4865	* end evaluation on encountering a throttled cfs_rq
				4866	*
				4867	* note: in the case of encountering a throttled cfs_rq we will
				4868	* post the final h_nr_running decrement below.
				4869	*/
				4870	if (cfs_rq_throttled(cfs_rq))
				4871	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4872	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4873
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4874	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4875	if (cfs_rq->load.weight) {
Konstantin Khlebnikov	754bd59	2016-06-16 15:57:15 +0300	[diff] [blame]	4876	/* Avoid re-evaluating load for this entity: */
				4877	se = parent_entity(se);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4878	/*
				4879	* Bias pick_next to pick a task from this cfs_rq, as
				4880	* p is sleeping when it is within its sched_slice.
				4881	*/
Konstantin Khlebnikov	754bd59	2016-06-16 15:57:15 +0300	[diff] [blame]	4882	if (task_sleep && se && !throttled_hierarchy(cfs_rq))
				4883	set_next_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4884	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4885	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4886	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4887	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4888
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4889	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	4890	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4891	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4892
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4893	if (cfs_rq_throttled(cfs_rq))
				4894	break;
				4895
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	4896	update_load_avg(se, UPDATE_TG);
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	4897	update_cfs_shares(se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4898	}
				4899
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4900	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4901	sub_nr_running(rq, 1);
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4902
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4903	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4904	}
				4905
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4906	#ifdef CONFIG_SMP
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	4907
				4908	/* Working cpumask for: load_balance, load_balance_newidle. */
				4909	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
				4910	DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
				4911
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	4912	#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4913	/*
				4914	* per rq 'load' array crap; XXX kill this.
				4915	*/
				4916
				4917	/*
Peter Zijlstra	d937cdc	2015-10-19 13:49:30 +0200	[diff] [blame]	4918	* The exact cpuload calculated at every tick would be:
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4919	*
Peter Zijlstra	d937cdc	2015-10-19 13:49:30 +0200	[diff] [blame]	4920	* load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
				4921	*
				4922	* If a cpu misses updates for n ticks (as it was idle) and update gets
				4923	* called on the n+1-th tick when cpu may be busy, then we have:
				4924	*
				4925	* load_n = (1 - 1/2^i)^n * load_0
				4926	* load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4927	*
				4928	* decay_load_missed() below does efficient calculation of
Peter Zijlstra	d937cdc	2015-10-19 13:49:30 +0200	[diff] [blame]	4929	*
				4930	* load' = (1 - 1/2^i)^n * load
				4931	*
				4932	* Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
				4933	* This allows us to precompute the above in said factors, thereby allowing the
				4934	* reduction of an arbitrary n in O(log_2 n) steps. (See also
				4935	* fixed_power_int())
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4936	*
				4937	* The calculation is approximated on a 128 point scale.
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4938	*/
				4939	#define DEGRADE_SHIFT 7
Peter Zijlstra	d937cdc	2015-10-19 13:49:30 +0200	[diff] [blame]	4940
				4941	static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
				4942	static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
				4943	{ 0, 0, 0, 0, 0, 0, 0, 0 },
				4944	{ 64, 32, 8, 0, 0, 0, 0, 0 },
				4945	{ 96, 72, 40, 12, 1, 0, 0, 0 },
				4946	{ 112, 98, 75, 43, 15, 1, 0, 0 },
				4947	{ 120, 112, 98, 76, 45, 16, 2, 0 }
				4948	};
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4949
				4950	/*
				4951	* Update cpu_load for any missed ticks, due to tickless idle. The backlog
				4952	* would be when CPU is idle and so we just decay the old load without
				4953	* adding any new load.
				4954	*/
				4955	static unsigned long
				4956	decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
				4957	{
				4958	int j = 0;
				4959
				4960	if (!missed_updates)
				4961	return load;
				4962
				4963	if (missed_updates >= degrade_zero_ticks[idx])
				4964	return 0;
				4965
				4966	if (idx == 1)
				4967	return load >> missed_updates;
				4968
				4969	while (missed_updates) {
				4970	if (missed_updates % 2)
				4971	load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
				4972
				4973	missed_updates >>= 1;
				4974	j++;
				4975	}
				4976	return load;
				4977	}
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	4978	#endif /* CONFIG_NO_HZ_COMMON */
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4979
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4980	/**
Frederic Weisbecker	cee1afc	2016-04-13 15:56:50 +0200	[diff] [blame]	4981	* __cpu_load_update - update the rq->cpu_load[] statistics
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4982	* @this_rq: The rq to update statistics for
				4983	* @this_load: The current load
				4984	* @pending_updates: The number of missed updates
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4985	*
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4986	* Update rq->cpu_load[] statistics. This function is usually called every
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4987	* scheduler tick (TICK_NSEC).
				4988	*
				4989	* This function computes a decaying average:
				4990	*
				4991	* load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
				4992	*
				4993	* Because of NOHZ it might not get called on every tick which gives need for
				4994	* the @pending_updates argument.
				4995	*
				4996	* load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
				4997	* = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
				4998	* = A * (A * load[i]_n-2 + B) + B
				4999	* = A * (A * (A * load[i]_n-3 + B) + B) + B
				5000	* = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
				5001	* = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
				5002	* = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
				5003	* = (1 - 1/2^i)^n * (load[i]_0 - load) + load
				5004	*
				5005	* In the above we've assumed load_n := load, which is true for NOHZ_FULL as
				5006	* any change in load would have resulted in the tick being turned back on.
				5007	*
				5008	* For regular NOHZ, this reduces to:
				5009	*
				5010	* load[i]_n = (1 - 1/2^i)^n * load[i]_0
				5011	*
				5012	* see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5013	* term.
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5014	*/
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5015	static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
				5016	unsigned long pending_updates)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5017	{
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	5018	unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5019	int i, scale;
				5020
				5021	this_rq->nr_load_updates++;
				5022
				5023	/* Update our load: */
				5024	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
				5025	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
				5026	unsigned long old_load, new_load;
				5027
				5028	/* scale is effectively 1 << i now, and >> i divides by scale */
				5029
Byungchul Park	7400d3b	2016-01-15 16:07:49 +0900	[diff] [blame]	5030	old_load = this_rq->cpu_load[i];
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	5031	#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5032	old_load = decay_load_missed(old_load, pending_updates - 1, i);
Byungchul Park	7400d3b	2016-01-15 16:07:49 +0900	[diff] [blame]	5033	if (tickless_load) {
				5034	old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
				5035	/*
				5036	* old_load can never be a negative value because a
				5037	* decayed tickless_load cannot be greater than the
				5038	* original tickless_load.
				5039	*/
				5040	old_load += tickless_load;
				5041	}
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	5042	#endif
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5043	new_load = this_load;
				5044	/*
				5045	* Round up the averaging division if load is increasing. This
				5046	* prevents us from getting stuck on 9 if the load is 10, for
				5047	* example.
				5048	*/
				5049	if (new_load > old_load)
				5050	new_load += scale - 1;
				5051
				5052	this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
				5053	}
				5054
				5055	sched_avg_update(this_rq);
				5056	}
				5057
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	5058	/* Used instead of source_load when we know the type == 0 */
				5059	static unsigned long weighted_cpuload(const int cpu)
				5060	{
				5061	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
				5062	}
				5063
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5064	#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5065	/*
				5066	* There is no sane way to deal with nohz on smp when using jiffies because the
				5067	* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
				5068	* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
				5069	*
				5070	* Therefore we need to avoid the delta approach from the regular tick when
				5071	* possible since that would seriously skew the load calculation. This is why we
				5072	* use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
				5073	* jiffies deltas for updates happening while in nohz mode (idle ticks, idle
				5074	* loop exit, nohz_idle_balance, nohz full exit...)
				5075	*
				5076	* This means we might still be one tick off for nohz periods.
				5077	*/
				5078
				5079	static void cpu_load_update_nohz(struct rq *this_rq,
				5080	unsigned long curr_jiffies,
				5081	unsigned long load)
Frederic Weisbecker	be68a68	2016-01-13 17:01:29 +0100	[diff] [blame]	5082	{
				5083	unsigned long pending_updates;
				5084
				5085	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
				5086	if (pending_updates) {
				5087	this_rq->last_load_update_tick = curr_jiffies;
				5088	/*
				5089	* In the regular NOHZ case, we were idle, this means load 0.
				5090	* In the NOHZ_FULL case, we were non-idle, we should consider
				5091	* its weighted load.
				5092	*/
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5093	cpu_load_update(this_rq, load, pending_updates);
Frederic Weisbecker	be68a68	2016-01-13 17:01:29 +0100	[diff] [blame]	5094	}
				5095	}
				5096
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5097	/*
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5098	* Called from nohz_idle_balance() to update the load ratings before doing the
				5099	* idle balance.
				5100	*/
Frederic Weisbecker	cee1afc	2016-04-13 15:56:50 +0200	[diff] [blame]	5101	static void cpu_load_update_idle(struct rq *this_rq)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5102	{
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5103	/*
				5104	* bail if there's load or we're actually up-to-date.
				5105	*/
Frederic Weisbecker	be68a68	2016-01-13 17:01:29 +0100	[diff] [blame]	5106	if (weighted_cpuload(cpu_of(this_rq)))
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5107	return;
				5108
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5109	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5110	}
				5111
				5112	/*
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5113	* Record CPU load on nohz entry so we know the tickless load to account
				5114	* on nohz exit. cpu_load[0] happens then to be updated more frequently
				5115	* than other cpu_load[idx] but it should be fine as cpu_load readers
				5116	* shouldn't rely into synchronized cpu_load[*] updates.
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5117	*/
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5118	void cpu_load_update_nohz_start(void)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5119	{
				5120	struct rq *this_rq = this_rq();
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5121
				5122	/*
				5123	* This is all lockless but should be fine. If weighted_cpuload changes
				5124	* concurrently we'll exit nohz. And cpu_load write can race with
				5125	* cpu_load_update_idle() but both updater would be writing the same.
				5126	*/
				5127	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
				5128	}
				5129
				5130	/*
				5131	* Account the tickless load in the end of a nohz frame.
				5132	*/
				5133	void cpu_load_update_nohz_stop(void)
				5134	{
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	5135	unsigned long curr_jiffies = READ_ONCE(jiffies);
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5136	struct rq *this_rq = this_rq();
				5137	unsigned long load;
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	5138	struct rq_flags rf;
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5139
				5140	if (curr_jiffies == this_rq->last_load_update_tick)
				5141	return;
				5142
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5143	load = weighted_cpuload(cpu_of(this_rq));
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	5144	rq_lock(this_rq, &rf);
Matt Fleming	b52fad2	2016-05-03 20:46:54 +0100	[diff] [blame]	5145	update_rq_clock(this_rq);
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5146	cpu_load_update_nohz(this_rq, curr_jiffies, load);
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	5147	rq_unlock(this_rq, &rf);
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5148	}
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5149	#else /* !CONFIG_NO_HZ_COMMON */
				5150	static inline void cpu_load_update_nohz(struct rq *this_rq,
				5151	unsigned long curr_jiffies,
				5152	unsigned long load) { }
				5153	#endif /* CONFIG_NO_HZ_COMMON */
				5154
				5155	static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
				5156	{
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	5157	#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5158	/* See the mess around cpu_load_update_nohz(). */
				5159	this_rq->last_load_update_tick = READ_ONCE(jiffies);
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	5160	#endif
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5161	cpu_load_update(this_rq, load, 1);
				5162	}
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5163
				5164	/*
				5165	* Called from scheduler_tick()
				5166	*/
Frederic Weisbecker	cee1afc	2016-04-13 15:56:50 +0200	[diff] [blame]	5167	void cpu_load_update_active(struct rq *this_rq)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5168	{
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	5169	unsigned long load = weighted_cpuload(cpu_of(this_rq));
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5170
				5171	if (tick_nohz_tick_stopped())
				5172	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
				5173	else
				5174	cpu_load_update_periodic(this_rq, load);
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5175	}
				5176
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5177	/*
				5178	* Return a low guess at the load of a migration-source cpu weighted
				5179	* according to the scheduling class and "nice" value.
				5180	*
				5181	* We want to under-estimate the load of migration sources, to
				5182	* balance conservatively.
				5183	*/
				5184	static unsigned long source_load(int cpu, int type)
				5185	{
				5186	struct rq *rq = cpu_rq(cpu);
				5187	unsigned long total = weighted_cpuload(cpu);
				5188
				5189	if (type == 0 \|\| !sched_feat(LB_BIAS))
				5190	return total;
				5191
				5192	return min(rq->cpu_load[type-1], total);
				5193	}
				5194
				5195	/*
				5196	* Return a high guess at the load of a migration-target cpu weighted
				5197	* according to the scheduling class and "nice" value.
				5198	*/
				5199	static unsigned long target_load(int cpu, int type)
				5200	{
				5201	struct rq *rq = cpu_rq(cpu);
				5202	unsigned long total = weighted_cpuload(cpu);
				5203
				5204	if (type == 0 \|\| !sched_feat(LB_BIAS))
				5205	return total;
				5206
				5207	return max(rq->cpu_load[type-1], total);
				5208	}
				5209
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	5210	static unsigned long capacity_of(int cpu)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5211	{
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	5212	return cpu_rq(cpu)->cpu_capacity;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5213	}
				5214
Vincent Guittot	ca6d75e	2015-02-27 16:54:09 +0100	[diff] [blame]	5215	static unsigned long capacity_orig_of(int cpu)
				5216	{
				5217	return cpu_rq(cpu)->cpu_capacity_orig;
				5218	}
				5219
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5220	static unsigned long cpu_avg_load_per_task(int cpu)
				5221	{
				5222	struct rq *rq = cpu_rq(cpu);
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	5223	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	5224	unsigned long load_avg = weighted_cpuload(cpu);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5225
				5226	if (nr_running)
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	5227	return load_avg / nr_running;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5228
				5229	return 0;
				5230	}
				5231
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	5232	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	5233	/*
				5234	* effective_load() calculates the load change as seen from the root_task_group
				5235	*
				5236	* Adding load to a group doesn't make a group heavier, but can cause movement
				5237	* of group shares between cpus. Assuming the shares were perfectly aligned one
				5238	* can calculate the shift in shares.
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	5239	*
				5240	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				5241	* on this @cpu and results in a total addition (subtraction) of @wg to the
				5242	* total group weight.
				5243	*
				5244	* Given a runqueue weight distribution (rw_i) we can compute a shares
				5245	* distribution (s_i) using:
				5246	*
				5247	* s_i = rw_i / \Sum rw_j (1)
				5248	*
				5249	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				5250	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				5251	* shares distribution (s_i):
				5252	*
				5253	* rw_i = { 2, 4, 1, 0 }
				5254	* s_i = { 2/7, 4/7, 1/7, 0 }
				5255	*
				5256	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				5257	* task used to run on and the CPU the waker is running on), we need to
				5258	* compute the effect of waking a task on either CPU and, in case of a sync
				5259	* wakeup, compute the effect of the current task going to sleep.
				5260	*
				5261	* So for a change of @wl to the local @cpu with an overall group weight change
				5262	* of @wl we can compute the new shares distribution (s'_i) using:
				5263	*
				5264	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				5265	*
				5266	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				5267	* differences in waking a task to CPU 0. The additional task changes the
				5268	* weight and shares distributions like:
				5269	*
				5270	* rw'_i = { 3, 4, 1, 0 }
				5271	* s'_i = { 3/8, 4/8, 1/8, 0 }
				5272	*
				5273	* We can then compute the difference in effective weight by using:
				5274	*
				5275	* dw_i = S * (s'_i - s_i) (3)
				5276	*
				5277	* Where 'S' is the group weight as seen by its parent.
				5278	*
				5279	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				5280	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				5281	* 4/7) times the weight of the group.
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	5282	*/
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	5283	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	5284	{
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5285	struct sched_entity *se = tg->se[cpu];
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	5286
Rik van Riel	9722c2d	2014-01-06 11:39:12 +0000	[diff] [blame]	5287	if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	5288	return wl;
				5289
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5290	for_each_sched_entity(se) {
Peter Zijlstra	7dd4912	2016-06-24 15:53:54 +0200	[diff] [blame]	5291	struct cfs_rq *cfs_rq = se->my_q;
				5292	long W, w = cfs_rq_load_avg(cfs_rq);
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	5293
Peter Zijlstra	7dd4912	2016-06-24 15:53:54 +0200	[diff] [blame]	5294	tg = cfs_rq->tg;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5295
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	5296	/*
				5297	* W = @wg + \Sum rw_j
				5298	*/
Peter Zijlstra	7dd4912	2016-06-24 15:53:54 +0200	[diff] [blame]	5299	W = wg + atomic_long_read(&tg->load_avg);
				5300
				5301	/* Ensure \Sum rw_j >= rw_i */
				5302	W -= cfs_rq->tg_load_avg_contrib;
				5303	W += w;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5304
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	5305	/*
				5306	* w = rw_i + @wl
				5307	*/
Peter Zijlstra	7dd4912	2016-06-24 15:53:54 +0200	[diff] [blame]	5308	w += wl;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	5309
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	5310	/*
				5311	* wl = S * s'_i; see (2)
				5312	*/
				5313	if (W > 0 && w < W)
Dietmar Eggemann	ab522e3	2016-08-22 15:00:41 +0100	[diff] [blame]	5314	wl = (w * (long)scale_load_down(tg->shares)) / W;
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	5315	else
Dietmar Eggemann	ab522e3	2016-08-22 15:00:41 +0100	[diff] [blame]	5316	wl = scale_load_down(tg->shares);
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	5317
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	5318	/*
				5319	* Per the above, wl is the new se->load.weight value; since
				5320	* those are clipped to [MIN_SHARES, ...) do so now. See
				5321	* calc_cfs_shares().
				5322	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	5323	if (wl < MIN_SHARES)
				5324	wl = MIN_SHARES;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	5325
				5326	/*
				5327	* wl = dw_i = S * (s'_i - s_i); see (3)
				5328	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5329	wl -= se->avg.load_avg;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	5330
				5331	/*
				5332	* Recursively apply this logic to all parent groups to compute
				5333	* the final effective load change on the root group. Since
				5334	* only the @tg group gets extra weight, all parent groups can
				5335	* only redistribute existing shares. @wl is the shift in shares
				5336	* resulting from this level per the above.
				5337	*/
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5338	wg = 0;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5339	}
				5340
				5341	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	5342	}
				5343	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5344
/*
 * !CONFIG_FAIR_GROUP_SCHED: there are no group shares to redistribute, so
 * the effective load change is @wl itself.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	return wl;
}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5349
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	5350	#endif
				5351
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5352	static void record_wakee(struct task_struct *p)
				5353	{
				5354	/*
				5355	* Only decay a single time; tasks that have less then 1 wakeup per
				5356	* jiffy will not have built up many flips.
				5357	*/
				5358	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
				5359	current->wakee_flips >>= 1;
				5360	current->wakee_flip_decay_ts = jiffies;
				5361	}
				5362
				5363	if (current->last_wakee != p) {
				5364	current->last_wakee = p;
				5365	current->wakee_flips++;
				5366	}
				5367	}
				5368
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5369	/*
				5370	* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5371	*
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5372	* A waker of many should wake a different task than the one last awakened
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5373	* at a frequency roughly N times higher than one of its wakees.
				5374	*
				5375	* In order to determine whether we should let the load spread vs consolidating
				5376	* to shared cache, we look for a minimum 'flip' frequency of llc_size in one
				5377	* partner, and a factor of lls_size higher frequency in the other.
				5378	*
				5379	* With both conditions met, we can be relatively sure that the relationship is
				5380	* non-monogamous, with partner count exceeding socket size.
				5381	*
				5382	* Waker/wakee being client/server, worker/dispatcher, interrupt source or
				5383	* whatever is irrelevant, spread criteria is apparent partner count exceeds
				5384	* socket size.
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5385	*/
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	5386	static int wake_wide(struct task_struct *p)
				5387	{
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5388	unsigned int master = current->wakee_flips;
				5389	unsigned int slave = p->wakee_flips;
Peter Zijlstra	7d9ffa8	2013-07-04 12:56:46 +0800	[diff] [blame]	5390	int factor = this_cpu_read(sd_llc_size);
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	5391
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5392	if (master < slave)
				5393	swap(master, slave);
				5394	if (slave < factor \|\| master < slave * factor)
				5395	return 0;
				5396	return 1;
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	5397	}
				5398
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5399	static int wake_affine(struct sched_domain sd, struct task_struct p,
				5400	int prev_cpu, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5401	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	5402	s64 this_load, load;
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5403	s64 this_eff_load, prev_eff_load;
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5404	int idx, this_cpu;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5405	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	5406	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	5407	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5408
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5409	idx = sd->wake_idx;
				5410	this_cpu = smp_processor_id();
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5411	load = source_load(prev_cpu, idx);
				5412	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5413
				5414	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5415	* If sync wakeup then subtract the (maximum possible)
				5416	* effect of the currently running task from the load
				5417	* of the current CPU:
				5418	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	5419	if (sync) {
				5420	tg = task_group(current);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5421	weight = current->se.avg.load_avg;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5422
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5423	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	5424	load += effective_load(tg, prev_cpu, 0, -weight);
				5425	}
				5426
				5427	tg = task_group(p);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5428	weight = p->se.avg.load_avg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	5429
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	5430	/*
				5431	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5432	* due to the sync cause above having dropped this_load to 0, we'll
				5433	* always have an imbalance, but there's really nothing you can do
				5434	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	5435	*
				5436	* Otherwise check if either cpus are near enough in load to allow this
				5437	* task to be woken on this_cpu.
				5438	*/
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5439	this_eff_load = 100;
				5440	this_eff_load *= capacity_of(prev_cpu);
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5441
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5442	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				5443	prev_eff_load *= capacity_of(this_cpu);
				5444
				5445	if (this_load > 0) {
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5446	this_eff_load *= this_load +
				5447	effective_load(tg, this_cpu, weight, weight);
				5448
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5449	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5450	}
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5451
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5452	balanced = this_eff_load <= prev_eff_load;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	5453
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	5454	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	5455
Vincent Guittot	05bfb65	2014-08-26 13:06:45 +0200	[diff] [blame]	5456	if (!balanced)
				5457	return 0;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5458
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	5459	schedstat_inc(sd->ttwu_move_affine);
				5460	schedstat_inc(p->se.statistics.nr_wakeups_affine);
Vincent Guittot	05bfb65	2014-08-26 13:06:45 +0200	[diff] [blame]	5461
				5462	return 1;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5463	}
				5464
/* Both helpers are defined further down in this file. */
static inline int task_util(struct task_struct *p);
static int cpu_util_wake(int cpu, struct task_struct *p);

/*
 * Spare capacity of @cpu for the wakeup of @p: the CPU's original capacity
 * minus its utilization as reported by cpu_util_wake() for @p.
 */
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
{
	unsigned long total = capacity_orig_of(cpu);
	unsigned long used = cpu_util_wake(cpu, p);

	return total - used;
}
				5472
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5473	/*
				5474	* find_idlest_group finds and returns the least busy CPU group within the
				5475	* domain.
				5476	*/
				5477	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	5478	find_idlest_group(struct sched_domain sd, struct task_struct p,
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	5479	int this_cpu, int sd_flag)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5480	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	5481	struct sched_group idlest = NULL, group = sd->groups;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5482	struct sched_group *most_spare_sg = NULL;
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5483	unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
				5484	unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5485	unsigned long most_spare = 0, this_spare = 0;
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	5486	int load_idx = sd->forkexec_idx;
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5487	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
				5488	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
				5489	(sd->imbalance_pct-100) / 100;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5490
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	5491	if (sd_flag & SD_BALANCE_WAKE)
				5492	load_idx = sd->wake_idx;
				5493
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5494	do {
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5495	unsigned long load, avg_load, runnable_load;
				5496	unsigned long spare_cap, max_spare_cap;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5497	int local_group;
				5498	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5499
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5500	/* Skip over this group if it has no CPUs allowed */
				5501	if (!cpumask_intersects(sched_group_cpus(group),
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	5502	&p->cpus_allowed))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5503	continue;
				5504
				5505	local_group = cpumask_test_cpu(this_cpu,
				5506	sched_group_cpus(group));
				5507
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5508	/*
				5509	* Tally up the load of all CPUs in the group and find
				5510	* the group containing the CPU with most spare capacity.
				5511	*/
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5512	avg_load = 0;
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5513	runnable_load = 0;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5514	max_spare_cap = 0;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5515
				5516	for_each_cpu(i, sched_group_cpus(group)) {
				5517	/* Bias balancing toward cpus of our domain */
				5518	if (local_group)
				5519	load = source_load(i, load_idx);
				5520	else
				5521	load = target_load(i, load_idx);
				5522
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5523	runnable_load += load;
				5524
				5525	avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5526
				5527	spare_cap = capacity_spare_wake(i, p);
				5528
				5529	if (spare_cap > max_spare_cap)
				5530	max_spare_cap = spare_cap;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5531	}
				5532
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	5533	/* Adjust by relative CPU capacity of the group */
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5534	avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
				5535	group->sgc->capacity;
				5536	runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
				5537	group->sgc->capacity;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5538
				5539	if (local_group) {
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5540	this_runnable_load = runnable_load;
				5541	this_avg_load = avg_load;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5542	this_spare = max_spare_cap;
				5543	} else {
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5544	if (min_runnable_load > (runnable_load + imbalance)) {
				5545	/*
				5546	* The runnable load is significantly smaller
				5547	* so we can pick this new cpu
				5548	*/
				5549	min_runnable_load = runnable_load;
				5550	min_avg_load = avg_load;
				5551	idlest = group;
				5552	} else if ((runnable_load < (min_runnable_load + imbalance)) &&
				5553	(100min_avg_load > imbalance_scaleavg_load)) {
				5554	/*
				5555	* The runnable loads are close so take the
				5556	* blocked load into account through avg_load.
				5557	*/
				5558	min_avg_load = avg_load;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5559	idlest = group;
				5560	}
				5561
				5562	if (most_spare < max_spare_cap) {
				5563	most_spare = max_spare_cap;
				5564	most_spare_sg = group;
				5565	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5566	}
				5567	} while (group = group->next, group != sd->groups);
				5568
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5569	/*
				5570	* The cross-over point between using spare capacity or least load
				5571	* is too conservative for high utilization tasks on partially
				5572	* utilized systems if we require spare_capacity > task_util(p),
				5573	* so we allow for some task stuffing by using
				5574	* spare_capacity > task_util(p)/2.
Vincent Guittot	f519a3f	2016-12-08 17:56:53 +0100	[diff] [blame]	5575	*
				5576	* Spare capacity can't be used for fork because the utilization has
				5577	* not been set yet, we must first select a rq to compute the initial
				5578	* utilization.
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5579	*/
Vincent Guittot	f519a3f	2016-12-08 17:56:53 +0100	[diff] [blame]	5580	if (sd_flag & SD_BALANCE_FORK)
				5581	goto skip_spare;
				5582
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5583	if (this_spare > task_util(p) / 2 &&
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5584	imbalance_scalethis_spare > 100most_spare)
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5585	return NULL;
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5586
				5587	if (most_spare > task_util(p) / 2)
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5588	return most_spare_sg;
				5589
Vincent Guittot	f519a3f	2016-12-08 17:56:53 +0100	[diff] [blame]	5590	skip_spare:
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5591	if (!idlest)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5592	return NULL;
Vincent Guittot	6b94780	2016-12-08 17:56:54 +0100	[diff] [blame]	5593
				5594	if (min_runnable_load > (this_runnable_load + imbalance))
				5595	return NULL;
				5596
				5597	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
				5598	(100this_avg_load < imbalance_scalemin_avg_load))
				5599	return NULL;
				5600
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5601	return idlest;
				5602	}
				5603
				5604	/*
				5605	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				5606	*/
				5607	static int
				5608	find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				5609	{
				5610	unsigned long load, min_load = ULONG_MAX;
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	5611	unsigned int min_exit_latency = UINT_MAX;
				5612	u64 latest_idle_timestamp = 0;
				5613	int least_loaded_cpu = this_cpu;
				5614	int shallowest_idle_cpu = -1;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5615	int i;
				5616
Morten Rasmussen	eaecf41	2016-06-22 18:03:14 +0100	[diff] [blame]	5617	/* Check if we have any choice: */
				5618	if (group->group_weight == 1)
				5619	return cpumask_first(sched_group_cpus(group));
				5620
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5621	/* Traverse only the allowed CPUs */
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	5622	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	5623	if (idle_cpu(i)) {
				5624	struct rq *rq = cpu_rq(i);
				5625	struct cpuidle_state *idle = idle_get_state(rq);
				5626	if (idle && idle->exit_latency < min_exit_latency) {
				5627	/*
				5628	* We give priority to a CPU whose idle state
				5629	* has the smallest exit latency irrespective
				5630	* of any idle timestamp.
				5631	*/
				5632	min_exit_latency = idle->exit_latency;
				5633	latest_idle_timestamp = rq->idle_stamp;
				5634	shallowest_idle_cpu = i;
				5635	} else if ((!idle \|\| idle->exit_latency == min_exit_latency) &&
				5636	rq->idle_stamp > latest_idle_timestamp) {
				5637	/*
				5638	* If equal or no active idle state, then
				5639	* the most recently idled CPU might have
				5640	* a warmer cache.
				5641	*/
				5642	latest_idle_timestamp = rq->idle_stamp;
				5643	shallowest_idle_cpu = i;
				5644	}
Yao Dongdong	9f96742	2014-10-28 04:08:06 +0000	[diff] [blame]	5645	} else if (shallowest_idle_cpu == -1) {
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	5646	load = weighted_cpuload(i);
				5647	if (load < min_load \|\| (load == min_load && i == this_cpu)) {
				5648	min_load = load;
				5649	least_loaded_cpu = i;
				5650	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5651	}
				5652	}
				5653
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	5654	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5655	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5656
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5657	/*
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5658	* Implement a for_each_cpu() variant that starts the scan at a given cpu
				5659	* (@start), and wraps around.
				5660	*
				5661	* This is used to scan for idle CPUs; such that not all CPUs looking for an
				5662	* idle CPU find the same CPU. The down-side is that tasks tend to cycle
				5663	* through the LLC domain.
				5664	*
				5665	* Especially tbench is found sensitive to this.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5666	*/
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5667
				5668	static int cpumask_next_wrap(int n, const struct cpumask mask, int start, int wrapped)
				5669	{
				5670	int next;
				5671
				5672	again:
				5673	next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
				5674
				5675	if (*wrapped) {
				5676	if (next >= start)
				5677	return nr_cpumask_bits;
				5678	} else {
				5679	if (next >= nr_cpumask_bits) {
				5680	*wrapped = 1;
				5681	n = -1;
				5682	goto again;
				5683	}
				5684	}
				5685
				5686	return next;
				5687	}
				5688
				5689	#define for_each_cpu_wrap(cpu, mask, start, wrap) \
				5690	for ((wrap) = 0, (cpu) = (start)-1; \
				5691	(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
				5692	(cpu) < nr_cpumask_bits; )
				5693
				5694	#ifdef CONFIG_SCHED_SMT
				5695
				5696	static inline void set_idle_cores(int cpu, int val)
				5697	{
				5698	struct sched_domain_shared *sds;
				5699
				5700	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				5701	if (sds)
				5702	WRITE_ONCE(sds->has_idle_cores, val);
				5703	}
				5704
				5705	static inline bool test_idle_cores(int cpu, bool def)
				5706	{
				5707	struct sched_domain_shared *sds;
				5708
				5709	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				5710	if (sds)
				5711	return READ_ONCE(sds->has_idle_cores);
				5712
				5713	return def;
				5714	}
				5715
				5716	/*
				5717	* Scans the local SMT mask to see if the entire core is idle, and records this
				5718	* information in sd_llc_shared->has_idle_cores.
				5719	*
				5720	* Since SMT siblings share all cache levels, inspecting this limited remote
				5721	* state should be fairly cheap.
				5722	*/
Peter Zijlstra	1b568f0	2016-05-09 10:38:41 +0200	[diff] [blame]	5723	void __update_idle_core(struct rq *rq)
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5724	{
				5725	int core = cpu_of(rq);
				5726	int cpu;
				5727
				5728	rcu_read_lock();
				5729	if (test_idle_cores(core, true))
				5730	goto unlock;
				5731
				5732	for_each_cpu(cpu, cpu_smt_mask(core)) {
				5733	if (cpu == core)
				5734	continue;
				5735
				5736	if (!idle_cpu(cpu))
				5737	goto unlock;
				5738	}
				5739
				5740	set_idle_cores(core, 1);
				5741	unlock:
				5742	rcu_read_unlock();
				5743	}
				5744
				5745	/*
				5746	* Scan the entire LLC domain for idle cores; this dynamically switches off if
				5747	* there are no idle cores left in the system; tracked through
				5748	* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
				5749	*/
				5750	static int select_idle_core(struct task_struct p, struct sched_domain sd, int target)
				5751	{
				5752	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
				5753	int core, cpu, wrap;
				5754
Peter Zijlstra	1b568f0	2016-05-09 10:38:41 +0200	[diff] [blame]	5755	if (!static_branch_likely(&sched_smt_present))
				5756	return -1;
				5757
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5758	if (!test_idle_cores(target, false))
				5759	return -1;
				5760
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	5761	cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed);
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5762
				5763	for_each_cpu_wrap(core, cpus, target, wrap) {
				5764	bool idle = true;
				5765
				5766	for_each_cpu(cpu, cpu_smt_mask(core)) {
				5767	cpumask_clear_cpu(cpu, cpus);
				5768	if (!idle_cpu(cpu))
				5769	idle = false;
				5770	}
				5771
				5772	if (idle)
				5773	return core;
				5774	}
				5775
				5776	/*
				5777	* Failed to find an idle core; stop looking for one.
				5778	*/
				5779	set_idle_cores(target, 0);
				5780
				5781	return -1;
				5782	}
				5783
				5784	/*
				5785	* Scan the local SMT mask for idle CPUs.
				5786	*/
				5787	static int select_idle_smt(struct task_struct p, struct sched_domain sd, int target)
				5788	{
				5789	int cpu;
				5790
Peter Zijlstra	1b568f0	2016-05-09 10:38:41 +0200	[diff] [blame]	5791	if (!static_branch_likely(&sched_smt_present))
				5792	return -1;
				5793
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5794	for_each_cpu(cpu, cpu_smt_mask(target)) {
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	5795	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5796	continue;
				5797	if (idle_cpu(cpu))
				5798	return cpu;
				5799	}
				5800
				5801	return -1;
				5802	}
				5803
				5804	#else /* CONFIG_SCHED_SMT */
				5805
				5806	static inline int select_idle_core(struct task_struct p, struct sched_domain sd, int target)
				5807	{
				5808	return -1;
				5809	}
				5810
				5811	static inline int select_idle_smt(struct task_struct p, struct sched_domain sd, int target)
				5812	{
				5813	return -1;
				5814	}
				5815
				5816	#endif /* CONFIG_SCHED_SMT */
				5817
				5818	/*
				5819	* Scan the LLC domain for idle CPUs; this is dynamically regulated by
				5820	* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
				5821	* average idle time for this rq (as found in rq->avg_idle).
				5822	*/
				5823	static int select_idle_cpu(struct task_struct p, struct sched_domain sd, int target)
				5824	{
Wanpeng Li	9cfb38a	2016-10-09 08:04:03 +0800	[diff] [blame]	5825	struct sched_domain *this_sd;
				5826	u64 avg_cost, avg_idle = this_rq()->avg_idle;
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5827	u64 time, cost;
				5828	s64 delta;
				5829	int cpu, wrap;
				5830
Wanpeng Li	9cfb38a	2016-10-09 08:04:03 +0800	[diff] [blame]	5831	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
				5832	if (!this_sd)
				5833	return -1;
				5834
				5835	avg_cost = this_sd->avg_scan_cost;
				5836
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5837	/*
				5838	* Due to large variance we need a large fuzz factor; hackbench in
				5839	* particularly is sensitive here.
				5840	*/
Peter Zijlstra	4c77b18	2017-03-01 11:24:35 +0100	[diff] [blame]	5841	if (sched_feat(SIS_AVG_CPU) && (avg_idle / 512) < avg_cost)
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5842	return -1;
				5843
				5844	time = local_clock();
				5845
				5846	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	5847	if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5848	continue;
				5849	if (idle_cpu(cpu))
				5850	break;
				5851	}
				5852
				5853	time = local_clock() - time;
				5854	cost = this_sd->avg_scan_cost;
				5855	delta = (s64)(time - cost) / 8;
				5856	this_sd->avg_scan_cost += delta;
				5857
				5858	return cpu;
				5859	}
				5860
				5861	/*
				5862	* Try and locate an idle core/thread in the LLC cache domain.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5863	*/
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5864	static int select_idle_sibling(struct task_struct *p, int prev, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5865	{
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5866	struct sched_domain *sd;
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5867	int i;
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	5868
				5869	if (idle_cpu(target))
				5870	return target;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5871
				5872	/*
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5873	* If the previous cpu is cache affine and idle, don't be stupid.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5874	*/
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5875	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
				5876	return prev;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5877
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	5878	sd = rcu_dereference(per_cpu(sd_llc, target));
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5879	if (!sd)
				5880	return target;
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5881
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5882	i = select_idle_core(p, sd, target);
				5883	if ((unsigned)i < nr_cpumask_bits)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5884	return i;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5885
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5886	i = select_idle_cpu(p, sd, target);
				5887	if ((unsigned)i < nr_cpumask_bits)
				5888	return i;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	5889
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5890	i = select_idle_smt(p, sd, target);
				5891	if ((unsigned)i < nr_cpumask_bits)
				5892	return i;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	5893
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5894	return target;
				5895	}
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5896
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5897	/*
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5898	* cpu_util returns the amount of capacity of a CPU that is used by CFS
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5899	* tasks. The unit of the return value must be the one of capacity so we can
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5900	* compare the utilization with the capacity of the CPU that is available for
				5901	* CFS task (ie cpu_capacity).
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5902	*
				5903	* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
				5904	* recent utilization of currently non-runnable tasks on a CPU. It represents
				5905	* the amount of utilization of a CPU in the range [0..capacity_orig] where
				5906	* capacity_orig is the cpu_capacity available at the highest frequency
				5907	* (arch_scale_freq_capacity()).
				5908	* The utilization of a CPU converges towards a sum equal to or less than the
				5909	* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
				5910	* the running time on this CPU scaled by capacity_curr.
				5911	*
				5912	* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
				5913	* higher than capacity_orig because of unfortunate rounding in
				5914	* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
				5915	* the average stabilizes with the new running time. We need to check that the
				5916	* utilization stays within the range of [0..capacity_orig] and cap it if
				5917	* necessary. Without utilization capping, a group could be seen as overloaded
				5918	* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
				5919	* available capacity. We allow utilization to overshoot capacity_curr (but not
				5920	* capacity_orig) as it useful for predicting the capacity required after task
				5921	* migrations (scheduler-driven DVFS).
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5922	*/
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5923	static int cpu_util(int cpu)
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5924	{
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5925	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5926	unsigned long capacity = capacity_orig_of(cpu);
				5927
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5928	return (util >= capacity) ? capacity : util;
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5929	}
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5930
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	5931	static inline int task_util(struct task_struct *p)
				5932	{
				5933	return p->se.avg.util_avg;
				5934	}
				5935
				5936	/*
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	5937	* cpu_util_wake: Compute cpu utilization with any contributions from
				5938	* the waking task p removed.
				5939	*/
				5940	static int cpu_util_wake(int cpu, struct task_struct *p)
				5941	{
				5942	unsigned long util, capacity;
				5943
				5944	/* Task has no contribution or is new */
				5945	if (cpu != task_cpu(p) \|\| !p->se.avg.last_update_time)
				5946	return cpu_util(cpu);
				5947
				5948	capacity = capacity_orig_of(cpu);
				5949	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
				5950
				5951	return (util >= capacity) ? capacity : util;
				5952	}
				5953
				5954	/*
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	5955	* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
				5956	* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
				5957	*
				5958	* In that case WAKE_AFFINE doesn't make sense and we'll let
				5959	* BALANCE_WAKE sort things out.
				5960	*/
				5961	static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
				5962	{
				5963	long min_cap, max_cap;
				5964
				5965	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
				5966	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
				5967
				5968	/* Minimum capacity is close to max, no need to abort wake_affine */
				5969	if (max_cap - min_cap < max_cap >> 3)
				5970	return 0;
				5971
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	5972	/* Bring task utilization in sync with prev_cpu */
				5973	sync_entity_load_avg(&p->se);
				5974
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	5975	return min_cap * 1024 < task_util(p) * capacity_margin;
				5976	}
				5977
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5978	/*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5979	* select_task_rq_fair: Select target runqueue for the waking task in domains
				5980	* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
				5981	* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5982	*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5983	* Balances load by selecting the idlest cpu in the idlest group, or under
				5984	* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5985	*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5986	* Returns the target cpu number.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5987	*
				5988	* preempt must be disabled.
				5989	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	5990	static int
Peter Zijlstra	ac66f54	2013-10-07 11:29:16 +0100	[diff] [blame]	5991	select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5992	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	5993	struct sched_domain tmp, affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5994	int cpu = smp_processor_id();
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5995	int new_cpu = prev_cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5996	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	5997	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5998
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5999	if (sd_flag & SD_BALANCE_WAKE) {
				6000	record_wakee(p);
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	6001	want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	6002	&& cpumask_test_cpu(cpu, &p->cpus_allowed);
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	6003	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	6004
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	6005	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6006	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f4288	2009-12-16 18:04:34 +0100	[diff] [blame]	6007	if (!(tmp->flags & SD_LOAD_BALANCE))
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	6008	break;
Peter Zijlstra	e4f4288	2009-12-16 18:04:34 +0100	[diff] [blame]	6009
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6010	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	6011	* If both cpu and prev_cpu are part of this domain,
				6012	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	6013	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	6014	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				6015	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				6016	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	6017	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	6018	}
				6019
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	6020	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	6021	sd = tmp;
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	6022	else if (!want_affine)
				6023	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	6024	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6025
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	6026	if (affine_sd) {
				6027	sd = NULL; /* Prefer wake_affine over balance flags */
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	6028	if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	6029	new_cpu = cpu;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	6030	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	6031
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	6032	if (!sd) {
				6033	if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	6034	new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	6035
				6036	} else while (sd) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6037	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	6038	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6039
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	6040	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6041	sd = sd->child;
				6042	continue;
				6043	}
				6044
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	6045	group = find_idlest_group(sd, p, cpu, sd_flag);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6046	if (!group) {
				6047	sd = sd->child;
				6048	continue;
				6049	}
				6050
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	6051	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6052	if (new_cpu == -1 \|\| new_cpu == cpu) {
				6053	/* Now try balancing at a lower domain level of cpu */
				6054	sd = sd->child;
				6055	continue;
				6056	}
				6057
				6058	/* Now try balancing at a lower domain level of new_cpu */
				6059	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	6060	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6061	sd = NULL;
				6062	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	6063	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6064	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	6065	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	6066	sd = tmp;
				6067	}
				6068	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	6069	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	6070	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	6071
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	6072	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	6073	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6074
				6075	/*
				6076	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				6077	* cfs_rq_of(p) references at time of call are still valid and identify the
Byungchul Park	525628c	2015-11-18 09:34:59 +0900	[diff] [blame]	6078	* previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6079	*/
xiaofeng.yan	5a4fd03	2015-09-23 14:55:59 +0800	[diff] [blame]	6080	static void migrate_task_rq_fair(struct task_struct *p)
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6081	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	6082	/*
Peter Zijlstra	59efa0b	2016-05-10 18:24:37 +0200	[diff] [blame]	6083	* As blocked tasks retain absolute vruntime the migration needs to
				6084	* deal with this by subtracting the old and adding the new
				6085	* min_vruntime -- the latter is done by enqueue_entity() when placing
				6086	* the task on the new runqueue.
				6087	*/
				6088	if (p->state == TASK_WAKING) {
				6089	struct sched_entity *se = &p->se;
				6090	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				6091	u64 min_vruntime;
				6092
				6093	#ifndef CONFIG_64BIT
				6094	u64 min_vruntime_copy;
				6095
				6096	do {
				6097	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				6098	smp_rmb();
				6099	min_vruntime = cfs_rq->min_vruntime;
				6100	} while (min_vruntime != min_vruntime_copy);
				6101	#else
				6102	min_vruntime = cfs_rq->min_vruntime;
				6103	#endif
				6104
				6105	se->vruntime -= min_vruntime;
				6106	}
				6107
				6108	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6109	* We are supposed to update the task to "current" time, then its up to date
				6110	* and ready to go to new CPU/cfs_rq. But we have difficulty in getting
				6111	* what current time is, so simply throw away the out-of-date time. This
				6112	* will result in the wakee task is less decayed, but giving the wakee more
				6113	* load sounds not bad.
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	6114	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6115	remove_entity_load_avg(&p->se);
				6116
				6117	/* Tell new CPU we are migrated */
				6118	p->se.avg.last_update_time = 0;
Ben Segall	3944a92	2014-05-15 15:59:20 -0700	[diff] [blame]	6119
				6120	/* We have migrated, no longer consider this task hot */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6121	p->se.exec_start = 0;
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6122	}
Yuyang Du	1269557	2015-07-15 08:04:40 +0800	[diff] [blame]	6123
				6124	static void task_dead_fair(struct task_struct *p)
				6125	{
				6126	remove_entity_load_avg(&p->se);
				6127	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	6128	#endif /* CONFIG_SMP */
				6129
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	6130	static unsigned long
				6131	wakeup_gran(struct sched_entity curr, struct sched_entity se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	6132	{
				6133	unsigned long gran = sysctl_sched_wakeup_granularity;
				6134
				6135	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	6136	* Since its curr running now, convert the gran from real-time
				6137	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	6138	*
				6139	* By using 'se' instead of 'curr' we penalize light tasks, so
				6140	* they get preempted easier. That is, if 'se' < 'curr' then
				6141	* the resulting gran will be larger, therefore penalizing the
				6142	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				6143	* be smaller, again penalizing the lighter task.
				6144	*
				6145	* This is especially important for buddies when the leftmost
				6146	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	6147	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	6148	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	6149	}
				6150
				6151	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	6152	* Should 'se' preempt 'curr'.
				6153	*
				6154	* \|s1
				6155	* \|s2
				6156	* \|s3
				6157	* g
				6158	* \|<--->\|c
				6159	*
				6160	* w(c, s1) = -1
				6161	* w(c, s2) = 0
				6162	* w(c, s3) = 1
				6163	*
				6164	*/
				6165	static int
				6166	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				6167	{
				6168	s64 gran, vdiff = curr->vruntime - se->vruntime;
				6169
				6170	if (vdiff <= 0)
				6171	return -1;
				6172
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	6173	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	6174	if (vdiff > gran)
				6175	return 1;
				6176
				6177	return 0;
				6178	}
				6179
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	6180	static void set_last_buddy(struct sched_entity *se)
				6181	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	6182	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				6183	return;
				6184
				6185	for_each_sched_entity(se)
				6186	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	6187	}
				6188
				6189	static void set_next_buddy(struct sched_entity *se)
				6190	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	6191	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				6192	return;
				6193
				6194	for_each_sched_entity(se)
				6195	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	6196	}
				6197
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	6198	static void set_skip_buddy(struct sched_entity *se)
				6199	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	6200	for_each_sched_entity(se)
				6201	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	6202	}
				6203
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	6204	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6205	* Preempt the current task with a newly woken task if needed:
				6206	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	6207	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6208	{
				6209	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	6210	struct sched_entity se = &curr->se, pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	6211	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	6212	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6213	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	6214
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	6215	if (unlikely(se == pse))
				6216	return;
				6217
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	6218	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6219	* This is possible from callers such as attach_tasks(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	6220	* unconditionally check_prempt_curr() after an enqueue (which may have
				6221	* lead to a throttle). This both saves work and prevents false
				6222	* next-buddy nomination below.
				6223	*/
				6224	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				6225	return;
				6226
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6227	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	6228	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6229	next_buddy_marked = 1;
				6230	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	6231
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	6232	/*
				6233	* We can come here with TIF_NEED_RESCHED already set from new task
				6234	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	6235	*
				6236	* Note: this also catches the edge-case of curr being in a throttled
				6237	* group (e.g. via set_curr_task), since update_curr() (in the
				6238	* enqueue of curr) will have resulted in resched being set. This
				6239	* prevents us from potentially nominating it as a false LAST_BUDDY
				6240	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	6241	*/
				6242	if (test_tsk_need_resched(curr))
				6243	return;
				6244
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	6245	/* Idle tasks are by definition preempted by non-idle tasks. */
				6246	if (unlikely(curr->policy == SCHED_IDLE) &&
				6247	likely(p->policy != SCHED_IDLE))
				6248	goto preempt;
				6249
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	6250	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	6251	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				6252	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	6253	*/
Ingo Molnar	8ed92e5	2012-10-14 14:28:50 +0200	[diff] [blame]	6254	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	6255	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6256
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6257	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	6258	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6259	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6260	if (wakeup_preempt_entity(se, pse) == 1) {
				6261	/*
				6262	* Bias pick_next to pick the sched entity that is
				6263	* triggering this preemption.
				6264	*/
				6265	if (!next_buddy_marked)
				6266	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6267	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6268	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	6269
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6270	return;
				6271
				6272	preempt:
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	6273	resched_curr(rq);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6274	/*
				6275	* Only set the backward buddy when the current task is still
				6276	* on the rq. This can happen when a wakeup gets interleaved
				6277	* with schedule on the ->pre_schedule() or idle_balance()
				6278	* point, either of which can * drop the rq lock.
				6279	*
				6280	* Also, during early boot the idle thread is in the fair class,
				6281	* for obvious reasons its a bad idea to schedule back to it.
				6282	*/
				6283	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				6284	return;
				6285
				6286	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				6287	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6288	}
				6289
Peter Zijlstra	606dba2	2012-02-11 06:05:00 +0100	[diff] [blame]	6290	static struct task_struct *
Matt Fleming	d8ac897	2016-09-21 14:38:10 +0100	[diff] [blame]	6291	pick_next_task_fair(struct rq rq, struct task_struct prev, struct rq_flags *rf)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6292	{
				6293	struct cfs_rq *cfs_rq = &rq->cfs;
				6294	struct sched_entity *se;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6295	struct task_struct *p;
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	6296	int new_tasks;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6297
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	6298	again:
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6299	#ifdef CONFIG_FAIR_GROUP_SCHED
				6300	if (!cfs_rq->nr_running)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6301	goto idle;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6302
Peter Zijlstra	3f1d2a3	2014-02-12 10:49:30 +0100	[diff] [blame]	6303	if (prev->sched_class != &fair_sched_class)
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6304	goto simple;
				6305
				6306	/*
				6307	* Because of the set_next_buddy() in dequeue_task_fair() it is rather
				6308	* likely that a next task is from the same cgroup as the current.
				6309	*
				6310	* Therefore attempt to avoid putting and setting the entire cgroup
				6311	* hierarchy, only change the part that actually changes.
				6312	*/
				6313
				6314	do {
				6315	struct sched_entity *curr = cfs_rq->curr;
				6316
				6317	/*
				6318	* Since we got here without doing put_prev_entity() we also
				6319	* have to consider cfs_rq->curr. If it is still a runnable
				6320	* entity, update_curr() will update its vruntime, otherwise
				6321	* forget we've ever seen it.
				6322	*/
Ben Segall	54d2736	2015-04-06 15:28:10 -0700	[diff] [blame]	6323	if (curr) {
				6324	if (curr->on_rq)
				6325	update_curr(cfs_rq);
				6326	else
				6327	curr = NULL;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6328
Ben Segall	54d2736	2015-04-06 15:28:10 -0700	[diff] [blame]	6329	/*
				6330	* This call to check_cfs_rq_runtime() will do the
				6331	* throttle and dequeue its entity in the parent(s).
				6332	* Therefore the 'simple' nr_running test will indeed
				6333	* be correct.
				6334	*/
				6335	if (unlikely(check_cfs_rq_runtime(cfs_rq)))
				6336	goto simple;
				6337	}
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6338
				6339	se = pick_next_entity(cfs_rq, curr);
				6340	cfs_rq = group_cfs_rq(se);
				6341	} while (cfs_rq);
				6342
				6343	p = task_of(se);
				6344
				6345	/*
				6346	* Since we haven't yet done put_prev_entity and if the selected task
				6347	* is a different task than we started out with, try and touch the
				6348	* least amount of cfs_rqs.
				6349	*/
				6350	if (prev != p) {
				6351	struct sched_entity *pse = &prev->se;
				6352
				6353	while (!(cfs_rq = is_same_group(se, pse))) {
				6354	int se_depth = se->depth;
				6355	int pse_depth = pse->depth;
				6356
				6357	if (se_depth <= pse_depth) {
				6358	put_prev_entity(cfs_rq_of(pse), pse);
				6359	pse = parent_entity(pse);
				6360	}
				6361	if (se_depth >= pse_depth) {
				6362	set_next_entity(cfs_rq_of(se), se);
				6363	se = parent_entity(se);
				6364	}
				6365	}
				6366
				6367	put_prev_entity(cfs_rq, pse);
				6368	set_next_entity(cfs_rq, se);
				6369	}
				6370
				6371	if (hrtick_enabled(rq))
				6372	hrtick_start_fair(rq, p);
				6373
				6374	return p;
				6375	simple:
				6376	cfs_rq = &rq->cfs;
				6377	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6378
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	6379	if (!cfs_rq->nr_running)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6380	goto idle;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6381
Peter Zijlstra	3f1d2a3	2014-02-12 10:49:30 +0100	[diff] [blame]	6382	put_prev_task(rq, prev);
Peter Zijlstra	606dba2	2012-02-11 06:05:00 +0100	[diff] [blame]	6383
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6384	do {
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6385	se = pick_next_entity(cfs_rq, NULL);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	6386	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6387	cfs_rq = group_cfs_rq(se);
				6388	} while (cfs_rq);
				6389
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6390	p = task_of(se);
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6391
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	6392	if (hrtick_enabled(rq))
				6393	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6394
				6395	return p;
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6396
				6397	idle:
Matt Fleming	46f69fa	2016-09-21 14:38:12 +0100	[diff] [blame]	6398	new_tasks = idle_balance(rq, rf);
				6399
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	6400	/*
				6401	* Because idle_balance() releases (and re-acquires) rq->lock, it is
				6402	* possible for any higher priority task to appear. In that case we
				6403	* must re-start the pick_next_entity() loop.
				6404	*/
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	6405	if (new_tasks < 0)
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	6406	return RETRY_TASK;
				6407
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	6408	if (new_tasks > 0)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6409	goto again;
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6410
				6411	return NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6412	}
				6413
				6414	/*
				6415	* Account for a descheduled task:
				6416	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	6417	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6418	{
				6419	struct sched_entity *se = &prev->se;
				6420	struct cfs_rq *cfs_rq;
				6421
				6422	for_each_sched_entity(se) {
				6423	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	6424	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6425	}
				6426	}
				6427
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	6428	/*
				6429	* sched_yield() is very simple
				6430	*
				6431	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				6432	*/
				6433	static void yield_task_fair(struct rq *rq)
				6434	{
				6435	struct task_struct *curr = rq->curr;
				6436	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				6437	struct sched_entity *se = &curr->se;
				6438
				6439	/*
				6440	* Are we the only task in the tree?
				6441	*/
				6442	if (unlikely(rq->nr_running == 1))
				6443	return;
				6444
				6445	clear_buddies(cfs_rq, se);
				6446
				6447	if (curr->policy != SCHED_BATCH) {
				6448	update_rq_clock(rq);
				6449	/*
				6450	* Update run-time statistics of the 'current'.
				6451	*/
				6452	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	6453	/*
				6454	* Tell update_rq_clock() that we've just updated,
				6455	* so we don't do microscopic update in schedule()
				6456	* and double the fastpath cost.
				6457	*/
Peter Zijlstra	9edfbfe	2015-01-05 11:18:11 +0100	[diff] [blame]	6458	rq_clock_skip_update(rq, true);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	6459	}
				6460
				6461	set_skip_buddy(se);
				6462	}
				6463
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6464	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				6465	{
				6466	struct sched_entity *se = &p->se;
				6467
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	6468	/* throttled hierarchies are not runnable */
				6469	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6470	return false;
				6471
				6472	/* Tell the scheduler that we'd really like pse to run next. */
				6473	set_next_buddy(se);
				6474
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6475	yield_task_fair(rq);
				6476
				6477	return true;
				6478	}
				6479
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6480	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6481	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6482	* Fair scheduling class load-balancing methods.
				6483	*
				6484	* BASICS
				6485	*
				6486	* The purpose of load-balancing is to achieve the same basic fairness the
				6487	* per-cpu scheduler provides, namely provide a proportional amount of compute
				6488	* time to each task. This is expressed in the following equation:
				6489	*
				6490	* W_i,n/C_i == W_j,n/C_j for all i,j (1)
				6491	*
				6492	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				6493	* W_i,0 is defined as:
				6494	*
				6495	* W_i,0 = \Sum_j w_i,j (2)
				6496	*
				6497	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
Yuyang Du	1c3de5e	2016-03-30 07:07:51 +0800	[diff] [blame]	6498	* is derived from the nice value as per sched_prio_to_weight[].
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6499	*
				6500	* The weight average is an exponential decay average of the instantaneous
				6501	* weight:
				6502	*
				6503	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				6504	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6505	* C_i is the compute capacity of cpu i, typically it is the
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6506	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				6507	* can also include other factors [XXX].
				6508	*
				6509	* To achieve this balance we define a measure of imbalance which follows
				6510	* directly from (1):
				6511	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6512	* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6513	*
				6514	* We then move tasks around to minimize the imbalance. In the continuous
				6515	* function space it is obvious this converges, in the discrete case we get
				6516	* a few fun cases generally called infeasible weight scenarios.
				6517	*
				6518	* [XXX expand on:
				6519	* - infeasible weights;
				6520	* - local vs global optima in the discrete case. ]
				6521	*
				6522	*
				6523	* SCHED DOMAINS
				6524	*
				6525	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				6526	* for all i,j solution, we create a tree of cpus that follows the hardware
				6527	* topology where each level pairs two lower groups (or better). This results
				6528	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				6529	* tree to only the first of the previous level and we decrease the frequency
				6530	* of load-balance at each level inv. proportional to the number of cpus in
				6531	* the groups.
				6532	*
				6533	* This yields:
				6534	*
				6535	*     log_2 n     1     n
				6536	*   \Sum       { --- * --- * 2^i } = O(n)                            (5)
				6537	*     i = 0      2^i   2^i
				6538	*                               `- size of each group
				6539	*         |         |     `- number of cpus doing load-balance
				6540	*         |         `- freq
				6541	*         `- sum over all levels
				6542	*
				6543	* Coupled with a limit on how many tasks we can migrate every balance pass,
				6544	* this makes (5) the runtime complexity of the balancer.
				6545	*
				6546	* An important property here is that each CPU is still (indirectly) connected
				6547	* to every other cpu in at most O(log n) steps:
				6548	*
				6549	* The adjacency matrix of the resulting graph is given by:
				6550	*
Byungchul Park	97a7142	2015-07-05 18:33:48 +0900	[diff] [blame]	6551	*                log_2 n
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6552	*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
				6553	*                k = 0
				6554	*
				6555	* And you'll find that:
				6556	*
				6557	* A^(log_2 n)_i,j != 0 for all i,j (7)
				6558	*
				6559	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				6560	* The task movement gives a factor of O(m), giving a convergence complexity
				6561	* of:
				6562	*
				6563	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				6564	*
				6565	*
				6566	* WORK CONSERVING
				6567	*
				6568	* In order to avoid CPUs going idle while there's still work to do, new idle
				6569	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				6570	* tree itself instead of relying on other CPUs to bring it work.
				6571	*
				6572	* This adds some complexity to both (5) and (8) but it reduces the total idle
				6573	* time.
				6574	*
				6575	* [XXX more?]
				6576	*
				6577	*
				6578	* CGROUPS
				6579	*
				6580	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				6581	*
				6582	*                                  s_k,i
				6583	*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
				6584	*                                   S_k
				6585	*
				6586	* Where
				6587	*
				6588	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				6589	*
				6590	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				6591	*
				6592	* The big problem is S_k, it's a global sum needed to compute a local (W_i)
				6593	* property.
				6594	*
				6595	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				6596	* rewrite all of this once again.]
Byungchul Park	97a7142	2015-07-05 18:33:48 +0900	[diff] [blame]	6597	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6598
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	6599	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				6600
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6601	enum fbq_type { regular, remote, all };
				6602
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6603	#define LBF_ALL_PINNED 0x01
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6604	#define LBF_NEED_BREAK 0x02
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6605	#define LBF_DST_PINNED 0x04
				6606	#define LBF_SOME_PINNED 0x08
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6607
				6608	struct lb_env {
				6609	struct sched_domain *sd;
				6610
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6611	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	6612	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6613
				6614	int dst_cpu;
				6615	struct rq *dst_rq;
				6616
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6617	struct cpumask *dst_grpmask;
				6618	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6619	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6620	long imbalance;
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	6621	/* The set of CPUs under consideration for load-balancing */
				6622	struct cpumask *cpus;
				6623
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6624	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6625
				6626	unsigned int loop;
				6627	unsigned int loop_break;
				6628	unsigned int loop_max;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6629
				6630	enum fbq_type fbq_type;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6631	struct list_head tasks;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6632	};
				6633
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6634	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6635	* Is this task likely cache-hot:
				6636	*/
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	6637	static int task_hot(struct task_struct p, struct lb_env env)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6638	{
				6639	s64 delta;
				6640
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6641	lockdep_assert_held(&env->src_rq->lock);
				6642
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6643	if (p->sched_class != &fair_sched_class)
				6644	return 0;
				6645
				6646	if (unlikely(p->policy == SCHED_IDLE))
				6647	return 0;
				6648
				6649	/*
				6650	* Buddy candidates are cache hot:
				6651	*/
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	6652	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6653	(&p->se == cfs_rq_of(&p->se)->next \|\|
				6654	&p->se == cfs_rq_of(&p->se)->last))
				6655	return 1;
				6656
				6657	if (sysctl_sched_migration_cost == -1)
				6658	return 1;
				6659	if (sysctl_sched_migration_cost == 0)
				6660	return 0;
				6661
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	6662	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6663
				6664	return delta < (s64)sysctl_sched_migration_cost;
				6665	}
				6666
#ifdef CONFIG_NUMA_BALANCING
/*
 * migrate_degrades_locality - NUMA-locality verdict on moving @p from
 * env->src_cpu to env->dst_cpu.
 *
 * Returns 1, if task migration degrades locality
 * Returns 0, if task migration improves locality i.e migration preferred.
 * Returns -1, if task migration is not affected by locality.
 *
 * -1 is also returned when NUMA balancing is disabled, when @p has no fault
 * statistics, when the domain is not SD_NUMA, or when both CPUs are on the
 * same node; the caller then falls back to task_hot().
 */
static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
	struct numa_group *numa_group = rcu_dereference(p->numa_group);
	unsigned long src_faults, dst_faults;
	int src_nid, dst_nid;

	if (!static_branch_likely(&sched_numa_balancing))
		return -1;

	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
		return -1;

	src_nid = cpu_to_node(env->src_cpu);
	dst_nid = cpu_to_node(env->dst_cpu);

	if (src_nid == dst_nid)
		return -1;

	/* Migrating away from the preferred node is always bad. */
	if (src_nid == p->numa_preferred_nid) {
		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
			return 1;
		else
			return -1;
	}

	/* Encourage migration to the preferred node. */
	if (dst_nid == p->numa_preferred_nid)
		return 0;

	/* Compare fault counts: group-wide if @p is in a numa_group, else per-task. */
	if (numa_group) {
		src_faults = group_faults(p, src_nid);
		dst_faults = group_faults(p, dst_nid);
	} else {
		src_faults = task_faults(p, src_nid);
		dst_faults = task_faults(p, dst_nid);
	}

	return dst_faults < src_faults;
}

#else
/* Without NUMA balancing, locality never influences the migration decision. */
static inline int migrate_degrades_locality(struct task_struct *p,
					     struct lb_env *env)
{
	return -1;
}
#endif
				6721
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6722	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6723	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				6724	*/
				6725	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6726	int can_migrate_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6727	{
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6728	int tsk_cache_hot;
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6729
				6730	lockdep_assert_held(&env->src_rq->lock);
				6731
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6732	/*
				6733	* We do not migrate tasks that are:
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	6734	* 1) throttled_lb_pair, or
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6735	* 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	6736	* 3) running (obviously), or
				6737	* 4) are cache-hot on their current CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6738	*/
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	6739	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				6740	return 0;
				6741
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	6742	if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) {
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	6743	int cpu;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6744
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6745	schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6746
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6747	env->flags \|= LBF_SOME_PINNED;
				6748
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6749	/*
				6750	* Remember if this task can be migrated to any other cpu in
				6751	* our sched_group. We may want to revisit it if we couldn't
				6752	* meet load balance goals by pulling other tasks on src_cpu.
				6753	*
				6754	* Also avoid computing new_dst_cpu if we have already computed
				6755	* one in current iteration.
				6756	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6757	if (!env->dst_grpmask \|\| (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6758	return 0;
				6759
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	6760	/* Prevent to re-select dst_cpu via env's cpus */
				6761	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	6762	if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6763	env->flags \|= LBF_DST_PINNED;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	6764	env->new_dst_cpu = cpu;
				6765	break;
				6766	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6767	}
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	6768
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6769	return 0;
				6770	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6771
				6772	/* Record that we found atleast one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6773	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6774
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6775	if (task_running(env->src_rq, p)) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6776	schedstat_inc(p->se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6777	return 0;
				6778	}
				6779
				6780	/*
				6781	* Aggressive migration if:
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6782	* 1) destination numa is preferred
				6783	* 2) task is cache cold, or
				6784	* 3) too many balance attempts have failed.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6785	*/
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6786	tsk_cache_hot = migrate_degrades_locality(p, env);
				6787	if (tsk_cache_hot == -1)
				6788	tsk_cache_hot = task_hot(p, env);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6789
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6790	if (tsk_cache_hot <= 0 \|\|
Kirill Tkhai	7a96c23	2014-09-22 22:36:12 +0400	[diff] [blame]	6791	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6792	if (tsk_cache_hot == 1) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6793	schedstat_inc(env->sd->lb_hot_gained[env->idle]);
				6794	schedstat_inc(p->se.statistics.nr_forced_migrations);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6795	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6796	return 1;
				6797	}
				6798
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6799	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	6800	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6801	}
				6802
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6803	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6804	* detach_task() -- detach the task for the migration specified in env
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6805	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6806	static void detach_task(struct task_struct p, struct lb_env env)
				6807	{
				6808	lockdep_assert_held(&env->src_rq->lock);
				6809
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6810	p->on_rq = TASK_ON_RQ_MIGRATING;
Peter Zijlstra	5704ac0	2017-02-21 17:15:21 +0100	[diff] [blame]	6811	deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6812	set_task_cpu(p, env->dst_cpu);
				6813	}
				6814
				6815	/*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6816	* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6817	* part of active balancing operations within "domain".
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6818	*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6819	* Returns a task if successful and NULL otherwise.
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6820	*/
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6821	static struct task_struct detach_one_task(struct lb_env env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6822	{
				6823	struct task_struct p, n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6824
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6825	lockdep_assert_held(&env->src_rq->lock);
				6826
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6827	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6828	if (!can_migrate_task(p, env))
				6829	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6830
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6831	detach_task(p, env);
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6832
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6833	/*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6834	* Right now, this is only the second place where
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6835	* lb_gained[env->idle] is updated (other is detach_tasks)
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6836	* so we can safely collect stats here rather than
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6837	* inside detach_tasks().
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6838	*/
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6839	schedstat_inc(env->sd->lb_gained[env->idle]);
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6840	return p;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6841	}
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6842	return NULL;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6843	}
				6844
/*
 * Step by which detach_tasks() advances env->loop_break each time it stops
 * the scan with LBF_NEED_BREAK.
 */
static const unsigned int sched_nr_migrate_break = 32;
				6846
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6847	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6848	* detach_tasks() -- tries to detach up to imbalance weighted load from
				6849	* busiest_rq, as part of a balancing operation within domain "sd".
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6850	*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6851	* Returns number of detached tasks if successful and 0 otherwise.
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6852	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6853	static int detach_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6854	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6855	struct list_head *tasks = &env->src_rq->cfs_tasks;
				6856	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6857	unsigned long load;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6858	int detached = 0;
				6859
				6860	lockdep_assert_held(&env->src_rq->lock);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6861
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6862	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6863	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6864
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6865	while (!list_empty(tasks)) {
Yuyang Du	985d3a4	2015-07-06 06:11:51 +0800	[diff] [blame]	6866	/*
				6867	* We don't want to steal all, otherwise we may be treated likewise,
				6868	* which could at worst lead to a livelock crash.
				6869	*/
				6870	if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
				6871	break;
				6872
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6873	p = list_first_entry(tasks, struct task_struct, se.group_node);
				6874
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6875	env->loop++;
				6876	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6877	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6878	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6879
				6880	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6881	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	6882	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6883	env->flags \|= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6884	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	6885	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6886
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	6887	if (!can_migrate_task(p, env))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6888	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6889
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6890	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6891
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	6892	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6893	goto next;
				6894
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6895	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6896	goto next;
				6897
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6898	detach_task(p, env);
				6899	list_add(&p->se.group_node, &env->tasks);
				6900
				6901	detached++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6902	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6903
				6904	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6905	/*
				6906	* NEWIDLE balancing is a source of latency, so preemptible
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6907	* kernels will stop after the first task is detached to minimize
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6908	* the critical section.
				6909	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6910	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6911	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6912	#endif
				6913
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6914	/*
				6915	* We only want to steal up to the prescribed amount of
				6916	* weighted load.
				6917	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6918	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6919	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6920
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6921	continue;
				6922	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6923	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6924	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6925
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6926	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6927	* Right now, this is one of only two places we collect this stat
				6928	* so we can safely collect detach_one_task() stats here rather
				6929	* than inside detach_one_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6930	*/
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6931	schedstat_add(env->sd->lb_gained[env->idle], detached);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6932
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6933	return detached;
				6934	}
				6935
				6936	/*
				6937	* attach_task() -- attach the task detached by detach_task() to its new rq.
				6938	*/
				6939	static void attach_task(struct rq rq, struct task_struct p)
				6940	{
				6941	lockdep_assert_held(&rq->lock);
				6942
				6943	BUG_ON(task_rq(p) != rq);
Peter Zijlstra	5704ac0	2017-02-21 17:15:21 +0100	[diff] [blame]	6944	activate_task(rq, p, ENQUEUE_NOCLOCK);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	6945	p->on_rq = TASK_ON_RQ_QUEUED;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6946	check_preempt_curr(rq, p, 0);
				6947	}
				6948
				6949	/*
				6950	* attach_one_task() -- attaches the task returned from detach_one_task() to
				6951	* its new rq.
				6952	*/
				6953	static void attach_one_task(struct rq rq, struct task_struct p)
				6954	{
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	6955	struct rq_flags rf;
				6956
				6957	rq_lock(rq, &rf);
Peter Zijlstra	5704ac0	2017-02-21 17:15:21 +0100	[diff] [blame]	6958	update_rq_clock(rq);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6959	attach_task(rq, p);
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	6960	rq_unlock(rq, &rf);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6961	}
				6962
				6963	/*
				6964	* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
				6965	* new rq.
				6966	*/
				6967	static void attach_tasks(struct lb_env *env)
				6968	{
				6969	struct list_head *tasks = &env->tasks;
				6970	struct task_struct *p;
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	6971	struct rq_flags rf;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6972
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	6973	rq_lock(env->dst_rq, &rf);
Peter Zijlstra	5704ac0	2017-02-21 17:15:21 +0100	[diff] [blame]	6974	update_rq_clock(env->dst_rq);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6975
				6976	while (!list_empty(tasks)) {
				6977	p = list_first_entry(tasks, struct task_struct, se.group_node);
				6978	list_del_init(&p->se.group_node);
				6979
				6980	attach_task(env->dst_rq, p);
				6981	}
				6982
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	6983	rq_unlock(env->dst_rq, &rf);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6984	}
				6985
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6986	#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6987	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6988	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6989	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6990	struct cfs_rq *cfs_rq;
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	6991	struct rq_flags rf;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6992
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	6993	rq_lock_irqsave(rq, &rf);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6994	update_rq_clock(rq);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6995
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6996	/*
				6997	* Iterates the task_group tree in a bottom up fashion, see
				6998	* list_add_leaf_cfs_rq() for details.
				6999	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	7000	for_each_leaf_cfs_rq(rq, cfs_rq) {
Vincent Guittot	bc42789	2017-03-17 14:47:22 +0100	[diff] [blame]	7001	struct sched_entity *se;
				7002
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	7003	/* throttled entities do not contribute to load */
				7004	if (throttled_hierarchy(cfs_rq))
				7005	continue;
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	7006
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	7007	if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	7008	update_tg_load_avg(cfs_rq, 0);
Vincent Guittot	4e51607	2016-11-08 10:53:46 +0100	[diff] [blame]	7009
Vincent Guittot	bc42789	2017-03-17 14:47:22 +0100	[diff] [blame]	7010	/* Propagate pending load changes to the parent, if any: */
				7011	se = cfs_rq->tg->se[cpu];
				7012	if (se && !skip_blocked_update(se))
				7013	update_load_avg(se, 0);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	7014	}
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	7015	rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	7016	}
				7017
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	7018	/*
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7019	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	7020	* This needs to be done in a top-down fashion because the load of a child
				7021	* group is a fraction of its parents load.
				7022	*/
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7023	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	7024	{
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7025	struct rq *rq = rq_of(cfs_rq);
				7026	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	7027	unsigned long now = jiffies;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7028	unsigned long load;
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	7029
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7030	if (cfs_rq->last_h_load_update == now)
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	7031	return;
				7032
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7033	cfs_rq->h_load_next = NULL;
				7034	for_each_sched_entity(se) {
				7035	cfs_rq = cfs_rq_of(se);
				7036	cfs_rq->h_load_next = se;
				7037	if (cfs_rq->last_h_load_update == now)
				7038	break;
				7039	}
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	7040
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7041	if (!se) {
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	7042	cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7043	cfs_rq->last_h_load_update = now;
				7044	}
				7045
				7046	while ((se = cfs_rq->h_load_next) != NULL) {
				7047	load = cfs_rq->h_load;
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	7048	load = div64_ul(load * se->avg.load_avg,
				7049	cfs_rq_load_avg(cfs_rq) + 1);
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7050	cfs_rq = group_cfs_rq(se);
				7051	cfs_rq->h_load = load;
				7052	cfs_rq->last_h_load_update = now;
				7053	}
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	7054	}
				7055
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	7056	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	7057	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	7058	struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	7059
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	7060	update_cfs_rq_h_load(cfs_rq);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	7061	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	7062	cfs_rq_load_avg(cfs_rq) + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	7063	}
				7064	#else
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	7065	static inline void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	7066	{
Vincent Guittot	6c1d47c	2015-07-15 08:04:38 +0800	[diff] [blame]	7067	struct rq *rq = cpu_rq(cpu);
				7068	struct cfs_rq *cfs_rq = &rq->cfs;
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	7069	struct rq_flags rf;
Vincent Guittot	6c1d47c	2015-07-15 08:04:38 +0800	[diff] [blame]	7070
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	7071	rq_lock_irqsave(rq, &rf);
Vincent Guittot	6c1d47c	2015-07-15 08:04:38 +0800	[diff] [blame]	7072	update_rq_clock(rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	7073	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	7074	rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	7075	}
				7076
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	7077	static unsigned long task_h_load(struct task_struct *p)
				7078	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	7079	return p->se.avg.load_avg;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	7080	}
				7081	#endif
				7082
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7083	/******** Helpers for find_busiest_group **********************/
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7084
				/*
 * Classification of a sched_group, stored in sg_lb_stats::group_type.
 * NOTE(review): the numeric order (other < imbalanced < overloaded) looks
 * significant to users outside this excerpt -- confirm before reordering.
 */
enum group_type {
	group_other = 0,
	group_imbalanced,
	group_overloaded,
};
				7090
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7091	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7092	* sg_lb_stats - stats of a sched_group required for load_balancing
				7093	*/
				7094	struct sg_lb_stats {
				7095	unsigned long avg_load; /Avg load across the CPUs of the group /
				7096	unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7097	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7098	unsigned long load_per_task;
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7099	unsigned long group_capacity;
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7100	unsigned long group_util; /* Total utilization of the group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7101	unsigned int sum_nr_running; /* Nr tasks running in the group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7102	unsigned int idle_cpus;
				7103	unsigned int group_weight;
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7104	enum group_type group_type;
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7105	int group_no_capacity;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7106	#ifdef CONFIG_NUMA_BALANCING
				7107	unsigned int nr_numa_running;
				7108	unsigned int nr_preferred_running;
				7109	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7110	};
				7111
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7112	/*
				7113	* sd_lb_stats - Structure to store the statistics of a sched_domain
				7114	* during load balancing.
				7115	*/
				7116	struct sd_lb_stats {
				7117	struct sched_group *busiest; /* Busiest group in this sd */
				7118	struct sched_group *local; /* Local group in this sd */
				7119	unsigned long total_load; /* Total load of all groups in sd */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7120	unsigned long total_capacity; /* Total capacity of all groups in sd */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7121	unsigned long avg_load; /* Average load across all groups in sd */
				7122
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7123	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7124	struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7125	};
				7126
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7127	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
				7128	{
				7129	/*
				7130	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
				7131	* local_stat because update_sg_lb_stats() does a full clear/assignment.
				7132	* We must however clear busiest_stat::avg_load because
				7133	* update_sd_pick_busiest() reads this before assignment.
				7134	*/
				7135	*sds = (struct sd_lb_stats){
				7136	.busiest = NULL,
				7137	.local = NULL,
				7138	.total_load = 0UL,
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7139	.total_capacity = 0UL,
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7140	.busiest_stat = {
				7141	.avg_load = 0UL,
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7142	.sum_nr_running = 0,
				7143	.group_type = group_other,
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7144	},
				7145	};
				7146	}
				7147
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7148	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7149	* get_sd_load_idx - Obtain the load index for a given sched domain.
				7150	* @sd: The sched_domain whose load_idx is to be obtained.
Kamalesh Babulal	ed1b773	2013-10-13 23:06:15 +0530	[diff] [blame]	7151	* @idle: The idle status of the CPU for whose sd load_idx is obtained.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	7152	*
				7153	* Return: The load index.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7154	*/
				7155	static inline int get_sd_load_idx(struct sched_domain *sd,
				7156	enum cpu_idle_type idle)
				7157	{
				7158	int load_idx;
				7159
				7160	switch (idle) {
				7161	case CPU_NOT_IDLE:
				7162	load_idx = sd->busy_idx;
				7163	break;
				7164
				7165	case CPU_NEWLY_IDLE:
				7166	load_idx = sd->newidle_idx;
				7167	break;
				7168	default:
				7169	load_idx = sd->idle_idx;
				7170	break;
				7171	}
				7172
				7173	return load_idx;
				7174	}
				7175
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7176	static unsigned long scale_rt_capacity(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7177	{
				7178	struct rq *rq = cpu_rq(cpu);
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	7179	u64 total, used, age_stamp, avg;
Peter Zijlstra	cadefd3	2014-02-27 10:40:35 +0100	[diff] [blame]	7180	s64 delta;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7181
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	7182	/*
				7183	* Since we're reading these variables without serialization make sure
				7184	* we read them once before doing sanity checks on them.
				7185	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	7186	age_stamp = READ_ONCE(rq->age_stamp);
				7187	avg = READ_ONCE(rq->rt_avg);
Peter Zijlstra	cebde6d	2015-01-05 11:18:10 +0100	[diff] [blame]	7188	delta = __rq_clock_broken(rq) - age_stamp;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	7189
Peter Zijlstra	cadefd3	2014-02-27 10:40:35 +0100	[diff] [blame]	7190	if (unlikely(delta < 0))
				7191	delta = 0;
				7192
				7193	total = sched_avg_period() + delta;
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	7194
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	7195	used = div_u64(avg, total);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7196
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	7197	if (likely(used < SCHED_CAPACITY_SCALE))
				7198	return SCHED_CAPACITY_SCALE - used;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7199
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	7200	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7201	}
				7202
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7203	static void update_cpu_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7204	{
Morten Rasmussen	8cd5601	2015-08-14 17:23:10 +0100	[diff] [blame]	7205	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7206	struct sched_group *sdg = sd->groups;
				7207
Vincent Guittot	ca6d75e	2015-02-27 16:54:09 +0100	[diff] [blame]	7208	cpu_rq(cpu)->cpu_capacity_orig = capacity;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7209
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7210	capacity *= scale_rt_capacity(cpu);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7211	capacity >>= SCHED_CAPACITY_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7212
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7213	if (!capacity)
				7214	capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7215
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7216	cpu_rq(cpu)->cpu_capacity = capacity;
				7217	sdg->sgc->capacity = capacity;
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7218	sdg->sgc->min_capacity = capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7219	}
				7220
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7221	void update_group_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7222	{
				7223	struct sched_domain *child = sd->child;
				7224	struct sched_group *group, *sdg = sd->groups;
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7225	unsigned long capacity, min_capacity;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	7226	unsigned long interval;
				7227
				7228	interval = msecs_to_jiffies(sd->balance_interval);
				7229	interval = clamp(interval, 1UL, max_load_balance_interval);
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7230	sdg->sgc->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7231
				7232	if (!child) {
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7233	update_cpu_capacity(sd, cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7234	return;
				7235	}
				7236
Vincent Guittot	dc7ff76	2015-03-03 11:35:03 +0100	[diff] [blame]	7237	capacity = 0;
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7238	min_capacity = ULONG_MAX;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7239
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	7240	if (child->flags & SD_OVERLAP) {
				7241	/*
				7242	* SD_OVERLAP domains cannot assume that child groups
				7243	* span the current group.
				7244	*/
				7245
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	7246	for_each_cpu(cpu, sched_group_cpus(sdg)) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7247	struct sched_group_capacity *sgc;
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7248	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	7249
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7250	/*
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7251	* build_sched_domains() -> init_sched_groups_capacity()
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7252	* gets here before we've attached the domains to the
				7253	* runqueues.
				7254	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7255	* Use capacity_of(), which is set irrespective of domains
				7256	* in update_cpu_capacity().
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7257	*
Vincent Guittot	dc7ff76	2015-03-03 11:35:03 +0100	[diff] [blame]	7258	* This avoids capacity from being 0 and
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7259	* causing divide-by-zero issues on boot.
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7260	*/
				7261	if (unlikely(!rq->sd)) {
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7262	capacity += capacity_of(cpu);
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7263	} else {
				7264	sgc = rq->sd->groups->sgc;
				7265	capacity += sgc->capacity;
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7266	}
				7267
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7268	min_capacity = min(capacity, min_capacity);
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	7269	}
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	7270	} else {
				7271	/*
				7272	* !SD_OVERLAP domains can assume that child groups
				7273	* span the current group.
Byungchul Park	97a7142	2015-07-05 18:33:48 +0900	[diff] [blame]	7274	*/
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	7275
				7276	group = child->groups;
				7277	do {
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7278	struct sched_group_capacity *sgc = group->sgc;
				7279
				7280	capacity += sgc->capacity;
				7281	min_capacity = min(sgc->min_capacity, min_capacity);
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	7282	group = group->next;
				7283	} while (group != child->groups);
				7284	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7285
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7286	sdg->sgc->capacity = capacity;
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7287	sdg->sgc->min_capacity = min_capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7288	}
				7289
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7290	/*
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7291	* Check whether the capacity of the rq has been noticeably reduced by side
				7292	* activity. The imbalance_pct is used for the threshold.
				7293	* Return true is the capacity is reduced
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7294	*/
				7295	static inline int
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7296	check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7297	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7298	return ((rq->cpu_capacity * sd->imbalance_pct) <
				7299	(rq->cpu_capacity_orig * 100));
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7300	}
				7301
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7302	/*
				7303	* Group imbalance indicates (and tries to solve) the problem where balancing
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	7304	* groups is inadequate due to ->cpus_allowed constraints.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7305	*
				7306	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
				7307	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
				7308	* Something like:
				7309	*
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	7310	* { 0 1 2 3 } { 4 5 6 7 }
				7311	*         *     * * *
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7312	*
				7313	* If we were to balance group-wise we'd place two tasks in the first group and
				7314	* two tasks in the second group. Clearly this is undesired as it will overload
				7315	* cpu 3 and leave one of the cpus in the second group unused.
				7316	*
				7317	* The current solution to this issue is detecting the skew in the first group
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7318	* by noticing the lower domain failed to reach balance and had difficulty
				7319	* moving tasks due to affinity constraints.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7320	*
				7321	* When this is so detected; this group becomes a candidate for busiest; see
Kamalesh Babulal	ed1b773	2013-10-13 23:06:15 +0530	[diff] [blame]	7322	* update_sd_pick_busiest(). And calculate_imbalance() and
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7323	* find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7324	* to create an effective group imbalance.
				7325	*
				7326	* This is a somewhat tricky proposition since the next run might not find the
				7327	* group imbalance and decide the groups need to be balanced again. A most
				7328	* subtle and fragile situation.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7329	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7330
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7331	static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7332	{
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7333	return group->sgc->imbalance;
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7334	}
				7335
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7336	/*
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7337	* group_has_capacity returns true if the group has spare capacity that could
				7338	* be used by some tasks.
				7339	* We consider that a group has spare capacity if the * number of task is
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7340	* smaller than the number of CPUs or if the utilization is lower than the
				7341	* available capacity for CFS tasks.
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7342	* For the latter, we use a threshold to stabilize the state, to take into
				7343	* account the variance of the tasks' load and to return true if the available
				7344	* capacity in meaningful for the load balancer.
				7345	* As an example, an available capacity of 1% can appear but it doesn't make
				7346	* any benefit for the load balance.
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7347	*/
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7348	static inline bool
				7349	group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7350	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7351	if (sgs->sum_nr_running < sgs->group_weight)
				7352	return true;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7353
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7354	if ((sgs->group_capacity * 100) >
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7355	(sgs->group_util * env->sd->imbalance_pct))
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7356	return true;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7357
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7358	return false;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7359	}
				7360
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7361	/*
				7362	* group_is_overloaded returns true if the group has more tasks than it can
				7363	* handle.
				7364	* group_is_overloaded is not equals to !group_has_capacity because a group
				7365	* with the exact right number of tasks, has no more spare capacity but is not
				7366	* overloaded so both group_has_capacity and group_is_overloaded return
				7367	* false.
				7368	*/
				7369	static inline bool
				7370	group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7371	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7372	if (sgs->sum_nr_running <= sgs->group_weight)
				7373	return false;
				7374
				7375	if ((sgs->group_capacity * 100) <
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7376	(sgs->group_util * env->sd->imbalance_pct))
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7377	return true;
				7378
				7379	return false;
				7380	}
				7381
Morten Rasmussen	9e0994c	2016-10-14 14:41:10 +0100	[diff] [blame]	7382	/*
				7383	* group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
				7384	* per-CPU capacity than sched_group ref.
				7385	*/
				7386	static inline bool
				7387	group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
				7388	{
				7389	return sg->sgc->min_capacity * capacity_margin <
				7390	ref->sgc->min_capacity * 1024;
				7391	}
				7392
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	7393	static inline enum
				7394	group_type group_classify(struct sched_group *group,
				7395	struct sg_lb_stats *sgs)
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7396	{
				7397	if (sgs->group_no_capacity)
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7398	return group_overloaded;
				7399
				7400	if (sg_imbalanced(group))
				7401	return group_imbalanced;
				7402
				7403	return group_other;
				7404	}
				7405
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7406	/**
				7407	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
				7408	* @env: The load balancing environment.
				7409	* @group: sched_group whose statistics are to be updated.
				7410	* @load_idx: Load index of sched_domain of this_cpu for load calc.
				7411	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7412	* @sgs: variable to hold the statistics for this group.
Masanari Iida	cd3bd4e	2014-07-28 12:38:06 +0900	[diff] [blame]	7413	* @overload: Indicate more than one runnable task for any CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7414	*/
				7415	static inline void update_sg_lb_stats(struct lb_env *env,
				7416	struct sched_group *group, int load_idx,
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7417	int local_group, struct sg_lb_stats *sgs,
				7418	bool *overload)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7419	{
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7420	unsigned long load;
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	7421	int i, nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7422
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7423	memset(sgs, 0, sizeof(*sgs));
				7424
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	7425	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7426	struct rq *rq = cpu_rq(i);
				7427
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7428	/* Bias balancing toward cpus of our domain */
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7429	if (local_group)
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	7430	load = target_load(i, load_idx);
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7431	else
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7432	load = source_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7433
				7434	sgs->group_load += load;
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7435	sgs->group_util += cpu_util(i);
Vincent Guittot	65fdac0	2014-08-26 13:06:46 +0200	[diff] [blame]	7436	sgs->sum_nr_running += rq->cfs.h_nr_running;
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7437
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	7438	nr_running = rq->nr_running;
				7439	if (nr_running > 1)
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7440	*overload = true;
				7441
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7442	#ifdef CONFIG_NUMA_BALANCING
				7443	sgs->nr_numa_running += rq->nr_numa_running;
				7444	sgs->nr_preferred_running += rq->nr_preferred_running;
				7445	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7446	sgs->sum_weighted_load += weighted_cpuload(i);
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	7447	/*
				7448	* No need to call idle_cpu() if nr_running is not 0
				7449	*/
				7450	if (!nr_running && idle_cpu(i))
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7451	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7452	}
				7453
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7454	/* Adjust by relative CPU capacity of the group */
				7455	sgs->group_capacity = group->sgc->capacity;
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7456	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7457
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7458	if (sgs->sum_nr_running)
Peter Zijlstra	38d0f77	2013-08-15 19:47:56 +0200	[diff] [blame]	7459	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7460
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7461	sgs->group_weight = group->group_weight;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7462
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7463	sgs->group_no_capacity = group_is_overloaded(env, sgs);
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	7464	sgs->group_type = group_classify(group, sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7465	}
				7466
				7467	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7468	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7469	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7470	* @sds: sched_domain statistics
				7471	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	7472	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7473	*
				7474	* Determine if @sg is a busier group than the previously selected
				7475	* busiest group.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	7476	*
				7477	* Return: %true if @sg is a busier group than the previously selected
				7478	* busiest group. %false otherwise.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7479	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7480	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7481	struct sd_lb_stats *sds,
				7482	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7483	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7484	{
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7485	struct sg_lb_stats *busiest = &sds->busiest_stat;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7486
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7487	if (sgs->group_type > busiest->group_type)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7488	return true;
				7489
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7490	if (sgs->group_type < busiest->group_type)
				7491	return false;
				7492
				7493	if (sgs->avg_load <= busiest->avg_load)
				7494	return false;
				7495
Morten Rasmussen	9e0994c	2016-10-14 14:41:10 +0100	[diff] [blame]	7496	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
				7497	goto asym_packing;
				7498
				7499	/*
				7500	* Candidate sg has no more than one task per CPU and
				7501	* has higher per-CPU capacity. Migrating tasks to less
				7502	* capable CPUs may harm throughput. Maximize throughput,
				7503	* power/energy consequences are not considered.
				7504	*/
				7505	if (sgs->sum_nr_running <= sgs->group_weight &&
				7506	group_smaller_cpu_capacity(sds->local, sg))
				7507	return false;
				7508
				7509	asym_packing:
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7510	/* This is the busiest node in its class. */
				7511	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7512	return true;
				7513
Srikar Dronamraju	1f621e0	2016-04-06 18:47:40 +0530	[diff] [blame]	7514	/* No ASYM_PACKING if target cpu is already busy */
				7515	if (env->idle == CPU_NOT_IDLE)
				7516	return true;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7517	/*
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	7518	* ASYM_PACKING needs to move all the work to the highest
				7519	* prority CPUs in the group, therefore mark all groups
				7520	* of lower priority than ourself as busy.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7521	*/
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	7522	if (sgs->sum_nr_running &&
				7523	sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7524	if (!sds->busiest)
				7525	return true;
				7526
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	7527	/* Prefer to move from lowest priority cpu's work */
				7528	if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
				7529	sg->asym_prefer_cpu))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7530	return true;
				7531	}
				7532
				7533	return false;
				7534	}
				7535
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7536	#ifdef CONFIG_NUMA_BALANCING
				7537	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				7538	{
				7539	if (sgs->sum_nr_running > sgs->nr_numa_running)
				7540	return regular;
				7541	if (sgs->sum_nr_running > sgs->nr_preferred_running)
				7542	return remote;
				7543	return all;
				7544	}
				7545
				7546	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				7547	{
				7548	if (rq->nr_running > rq->nr_numa_running)
				7549	return regular;
				7550	if (rq->nr_running > rq->nr_preferred_running)
				7551	return remote;
				7552	return all;
				7553	}
				7554	#else
				7555	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				7556	{
				7557	return all;
				7558	}
				7559
				7560	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				7561	{
				7562	return regular;
				7563	}
				7564	#endif /* CONFIG_NUMA_BALANCING */
				7565
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7566	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	7567	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7568	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7569	* @sds: variable to hold the statistics for this sched_domain.
				7570	*/
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7571	static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7572	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7573	struct sched_domain *child = env->sd->child;
				7574	struct sched_group *sg = env->sd->groups;
Srikar Dronamraju	05b40e0	2017-03-22 23:27:50 +0530	[diff] [blame]	7575	struct sg_lb_stats *local = &sds->local_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7576	struct sg_lb_stats tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7577	int load_idx, prefer_sibling = 0;
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7578	bool overload = false;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7579
				7580	if (child && child->flags & SD_PREFER_SIBLING)
				7581	prefer_sibling = 1;
				7582
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7583	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7584
				7585	do {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7586	struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7587	int local_group;
				7588
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7589	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7590	if (local_group) {
				7591	sds->local = sg;
Srikar Dronamraju	05b40e0	2017-03-22 23:27:50 +0530	[diff] [blame]	7592	sgs = local;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7593
				7594	if (env->idle != CPU_NEWLY_IDLE ||
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7595	time_after_eq(jiffies, sg->sgc->next_update))
				7596	update_group_capacity(env->sd, env->dst_cpu);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7597	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7598
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7599	update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
				7600	&overload);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7601
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7602	if (local_group)
				7603	goto next_group;
				7604
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7605	/*
				7606	* In case the child domain prefers tasks go to siblings
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7607	* first, lower the sg capacity so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	7608	* and move all the excess tasks away. We lower the capacity
				7609	* of a group only if the local group has the capacity to fit
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7610	* these excess tasks. The extra check prevents the case where
				7611	* you always pull from the heaviest group when it is already
				7612	* under-utilized (possible with a large weight task outweighs
				7613	* the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7614	*/
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7615	if (prefer_sibling && sds->local &&
Srikar Dronamraju	05b40e0	2017-03-22 23:27:50 +0530	[diff] [blame]	7616	group_has_capacity(env, local) &&
				7617	(sgs->sum_nr_running > local->sum_nr_running + 1)) {
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7618	sgs->group_no_capacity = 1;
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	7619	sgs->group_type = group_classify(sg, sgs);
Wanpeng Li	cb0b9f2	2014-11-05 07:44:50 +0800	[diff] [blame]	7620	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7621
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7622	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7623	sds->busiest = sg;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7624	sds->busiest_stat = *sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7625	}
				7626
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7627	next_group:
				7628	/* Now, start updating sd_lb_stats */
				7629	sds->total_load += sgs->group_load;
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7630	sds->total_capacity += sgs->group_capacity;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7631
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7632	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7633	} while (sg != env->sd->groups);
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7634
				7635	if (env->sd->flags & SD_NUMA)
				7636	env->fbq_type = fbq_classify_group(&sds->busiest_stat);
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7637
				7638	if (!env->sd->parent) {
				7639	/* update overload indicator if we are at root domain */
				7640	if (env->dst_rq->rd->overload != overload)
				7641	env->dst_rq->rd->overload = overload;
				7642	}
				7643
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7644	}
				7645
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7646	/**
				7647	* check_asym_packing - Check to see if the group is packed into the
				7648	* sched doman.
				7649	*
				7650	* This is primarily intended to used at the sibling level. Some
				7651	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				7652	* case of POWER7, it can move to lower SMT modes only when higher
				7653	* threads are idle. When in lower SMT modes, the threads will
				7654	* perform better since they share less core resources. Hence when we
				7655	* have idle threads, we want them to be the higher ones.
				7656	*
				7657	* This packing function is run on idle threads. It checks to see if
				7658	* the busiest CPU in this domain (core in the P7 case) has a higher
				7659	* CPU number than the packing function is being run on. Here we are
				7660	* assuming lower CPU number will be equivalent to lower a SMT thread
				7661	* number.
				7662	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	7663	* Return: 1 when packing is required and a task should be moved to
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	7664	* this CPU. The amount of the imbalance is returned in *imbalance.
				7665	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7666	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7667	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7668	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7669	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7670	{
				7671	int busiest_cpu;
				7672
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7673	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7674	return 0;
				7675
Srikar Dronamraju	1f621e0	2016-04-06 18:47:40 +0530	[diff] [blame]	7676	if (env->idle == CPU_NOT_IDLE)
				7677	return 0;
				7678
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7679	if (!sds->busiest)
				7680	return 0;
				7681
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	7682	busiest_cpu = sds->busiest->asym_prefer_cpu;
				7683	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7684	return 0;
				7685
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7686	env->imbalance = DIV_ROUND_CLOSEST(
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7687	sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7688	SCHED_CAPACITY_SCALE);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7689
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7690	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7691	}
				7692
				7693	/**
				7694	* fix_small_imbalance - Calculate the minor imbalance that exists
				7695	* amongst the groups of a sched_domain, during
				7696	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7697	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7698	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7699	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7700	static inline
				7701	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7702	{
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7703	unsigned long tmp, capa_now = 0, capa_move = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7704	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7705	unsigned long scaled_busy_load_per_task;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7706	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7707
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7708	local = &sds->local_stat;
				7709	busiest = &sds->busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7710
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7711	if (!local->sum_nr_running)
				7712	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
				7713	else if (busiest->load_per_task > local->load_per_task)
				7714	imbn = 1;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7715
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7716	scaled_busy_load_per_task =
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7717	(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7718	busiest->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7719
Vladimir Davydov	3029ede	2013-09-15 17:49:14 +0400	[diff] [blame]	7720	if (busiest->avg_load + scaled_busy_load_per_task >=
				7721	local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7722	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7723	return;
				7724	}
				7725
				7726	/*
				7727	* OK, we don't have enough imbalance to justify moving tasks,
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7728	* however we may be able to increase total CPU capacity used by
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7729	* moving them.
				7730	*/
				7731
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7732	capa_now += busiest->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7733	min(busiest->load_per_task, busiest->avg_load);
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7734	capa_now += local->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7735	min(local->load_per_task, local->avg_load);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7736	capa_now /= SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7737
				7738	/* Amount of load we'd subtract */
Vincent Guittot	a2cd426	2014-03-11 17:26:06 +0100	[diff] [blame]	7739	if (busiest->avg_load > scaled_busy_load_per_task) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7740	capa_move += busiest->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7741	min(busiest->load_per_task,
Vincent Guittot	a2cd426	2014-03-11 17:26:06 +0100	[diff] [blame]	7742	busiest->avg_load - scaled_busy_load_per_task);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7743	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7744
				7745	/* Amount of load we'd add */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7746	if (busiest->avg_load * busiest->group_capacity <
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7747	busiest->load_per_task * SCHED_CAPACITY_SCALE) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7748	tmp = (busiest->avg_load * busiest->group_capacity) /
				7749	local->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7750	} else {
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7751	tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7752	local->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7753	}
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7754	capa_move += local->group_capacity *
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	7755	min(local->load_per_task, local->avg_load + tmp);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7756	capa_move /= SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7757
				7758	/* Move if we gain throughput */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7759	if (capa_move > capa_now)
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7760	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7761	}
				7762
				7763	/**
				7764	* calculate_imbalance - Calculate the amount of imbalance present within the
				7765	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7766	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7767	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7768	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7769	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7770	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7771	unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7772	struct sg_lb_stats local, busiest;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7773
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7774	local = &sds->local_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7775	busiest = &sds->busiest_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7776
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7777	if (busiest->group_type == group_imbalanced) {
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7778	/*
				7779	* In the group_imb case we cannot rely on group-wide averages
				7780	* to ensure cpu-load equilibrium, look at wider averages. XXX
				7781	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7782	busiest->load_per_task =
				7783	min(busiest->load_per_task, sds->avg_load);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7784	}
				7785
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7786	/*
Dietmar Eggemann	885e542	2016-04-29 20:32:39 +0100	[diff] [blame]	7787	* Avg load of busiest sg can be less and avg load of local sg can
				7788	* be greater than avg load across all sgs of sd because avg load
				7789	* factors in sg capacity and sgs with smaller group_type are
				7790	* skipped when updating the busiest sg:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7791	*/
Vladimir Davydov	b188555	2013-09-15 17:49:13 +0400	[diff] [blame]	7792	if (busiest->avg_load <= sds->avg_load \|\|
				7793	local->avg_load >= sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7794	env->imbalance = 0;
				7795	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7796	}
				7797
Peter Zijlstra	9a5d9ba	2014-07-29 17:15:11 +0200	[diff] [blame]	7798	/*
				7799	* If there aren't any idle cpus, avoid creating some.
				7800	*/
				7801	if (busiest->group_type == group_overloaded &&
				7802	local->group_type == group_overloaded) {
Peter Zijlstra	1be0eb2	2016-05-06 12:21:23 +0200	[diff] [blame]	7803	load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
Morten Rasmussen	cfa1033	2016-04-29 20:32:40 +0100	[diff] [blame]	7804	if (load_above_capacity > busiest->group_capacity) {
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7805	load_above_capacity -= busiest->group_capacity;
Dietmar Eggemann	2665621	2016-08-10 11:27:27 +0100	[diff] [blame]	7806	load_above_capacity *= scale_load_down(NICE_0_LOAD);
Morten Rasmussen	cfa1033	2016-04-29 20:32:40 +0100	[diff] [blame]	7807	load_above_capacity /= busiest->group_capacity;
				7808	} else
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7809	load_above_capacity = ~0UL;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7810	}
				7811
				7812	/*
				7813	* We're trying to get all the cpus to the average_load, so we don't
				7814	* want to push ourselves above the average load, nor do we wish to
				7815	* reduce the max loaded cpu below the average load. At the same time,
Dietmar Eggemann	0a9b23c	2016-04-29 20:32:38 +0100	[diff] [blame]	7816	* we also don't want to reduce the group load below the group
				7817	* capacity. Thus we look for the minimum possible imbalance.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7818	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7819	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7820
				7821	/* How much load to actually move to equalise the imbalance */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7822	env->imbalance = min(
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7823	max_pull * busiest->group_capacity,
				7824	(sds->avg_load - local->avg_load) * local->group_capacity
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7825	) / SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7826
				7827	/*
				7828	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	7829	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7830	* a think about bumping its value to force at least one task to be
				7831	* moved
				7832	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7833	if (env->imbalance < busiest->load_per_task)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7834	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7835	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	7836
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7837	/***** find_busiest_group() helpers end here *******************/
				7838
				7839	/**
				7840	* find_busiest_group - Returns the busiest group within the sched_domain
Dietmar Eggemann	0a9b23c	2016-04-29 20:32:38 +0100	[diff] [blame]	7841	* if there is an imbalance.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7842	*
				7843	* Also calculates the amount of weighted load which should be moved
				7844	* to restore balance.
				7845	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7846	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7847	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	7848	* Return: - The busiest group if imbalance exists.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7849	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7850	static struct sched_group find_busiest_group(struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7851	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7852	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7853	struct sd_lb_stats sds;
				7854
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7855	init_sd_lb_stats(&sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7856
				7857	/*
				7858	* Compute the various statistics relavent for load balancing at
				7859	* this level.
				7860	*/
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7861	update_sd_lb_stats(env, &sds);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7862	local = &sds.local_stat;
				7863	busiest = &sds.busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7864
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7865	/* ASYM feature bypasses nice load balance check */
Srikar Dronamraju	1f621e0	2016-04-06 18:47:40 +0530	[diff] [blame]	7866	if (check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7867	return sds.busiest;
				7868
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7869	/* There is no busy sibling group to pull tasks from */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7870	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7871	goto out_balanced;
				7872
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7873	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
				7874	/ sds.total_capacity;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	7875
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	7876	/*
				7877	* If the busiest group is imbalanced the below checks don't
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7878	* work because they assume all things are equal, which typically
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	7879	* isn't true due to cpus_allowed constraints and the like.
				7880	*/
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7881	if (busiest->group_type == group_imbalanced)
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	7882	goto force_balance;
				7883
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7884	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7885	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
				7886	busiest->group_no_capacity)
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	7887	goto force_balance;
				7888
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7889	/*
Zhihui Zhang	9c58c79	2014-09-20 21:24:36 -0400	[diff] [blame]	7890	* If the local group is busier than the selected busiest group
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7891	* don't try and pull any tasks.
				7892	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7893	if (local->avg_load >= busiest->avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7894	goto out_balanced;
				7895
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7896	/*
				7897	* Don't pull any tasks if this group is already above the domain
				7898	* average load.
				7899	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7900	if (local->avg_load >= sds.avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7901	goto out_balanced;
				7902
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7903	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7904	/*
Vincent Guittot	43f4d66	2014-10-01 15:38:55 +0200	[diff] [blame]	7905	* This cpu is idle. If the busiest group is not overloaded
				7906	* and there is no imbalance between this and busiest group
				7907	* wrt idle cpus, it is balanced. The imbalance becomes
				7908	* significant if the diff is greater than 1 otherwise we
				7909	* might end up to just move the imbalance on another group
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7910	*/
Vincent Guittot	43f4d66	2014-10-01 15:38:55 +0200	[diff] [blame]	7911	if ((busiest->group_type != group_overloaded) &&
				7912	(local->idle_cpus <= (busiest->idle_cpus + 1)))
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7913	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	7914	} else {
				7915	/*
				7916	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				7917	* imbalance_pct to be conservative.
				7918	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7919	if (100 * busiest->avg_load <=
				7920	env->sd->imbalance_pct * local->avg_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	7921	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7922	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7923
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	7924	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7925	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7926	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7927	return sds.busiest;
				7928
				7929	out_balanced:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7930	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7931	return NULL;
				7932	}
				7933
				7934	/*
				7935	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				7936	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7937	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	7938	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7939	{
				7940	struct rq busiest = NULL, rq;
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7941	unsigned long busiest_load = 0, busiest_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7942	int i;
				7943
Peter Zijlstra	6906a40	2013-08-19 15:20:21 +0200	[diff] [blame]	7944	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7945	unsigned long capacity, wl;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7946	enum fbq_type rt;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7947
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7948	rq = cpu_rq(i);
				7949	rt = fbq_classify_rq(rq);
				7950
				7951	/*
				7952	* We classify groups/runqueues into three groups:
				7953	* - regular: there are !numa tasks
				7954	* - remote: there are numa tasks that run on the 'wrong' node
				7955	* - all: there is no distinction
				7956	*
				7957	* In order to avoid migrating ideally placed numa tasks,
				7958	* ignore those when there's better options.
				7959	*
				7960	* If we ignore the actual busiest queue to migrate another
				7961	* task, the next balance pass can still reduce the busiest
				7962	* queue by moving tasks around inside the node.
				7963	*
				7964	* If we cannot move enough load due to this classification
				7965	* the next pass will adjust the group classification and
				7966	* allow migration of more tasks.
				7967	*
				7968	* Both cases only affect the total convergence complexity.
				7969	*/
				7970	if (rt > env->fbq_type)
				7971	continue;
				7972
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7973	capacity = capacity_of(i);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7974
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7975	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7976
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7977	/*
				7978	* When comparing with imbalance, use weighted_cpuload()
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7979	* which is not scaled with the cpu capacity.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7980	*/
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7981
				7982	if (rq->nr_running == 1 && wl > env->imbalance &&
				7983	!check_cpu_capacity(rq, env->sd))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7984	continue;
				7985
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7986	/*
				7987	* For the load comparisons with the other cpu's, consider
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7988	* the weighted_cpuload() scaled with the cpu capacity, so
				7989	* that the load can be moved away from the cpu that is
				7990	* potentially running at a lower capacity.
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	7991	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7992	* Thus we're looking for max(wl_i / capacity_i), crosswise
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	7993	* multiplication to rid ourselves of the division works out
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7994	* to: wl_i * capacity_j > wl_j * capacity_i; where j is
				7995	* our previous maximum.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7996	*/
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7997	if (wl * busiest_capacity > busiest_load * capacity) {
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	7998	busiest_load = wl;
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7999	busiest_capacity = capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8000	busiest = rq;
				8001	}
				8002	}
				8003
				8004	return busiest;
				8005	}
				8006
				8007	/*
				8008	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				8009	* so long as it is large enough.
				8010	*/
				8011	#define MAX_PINNED_INTERVAL 512
				8012
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	8013	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	8014	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	8015	struct sched_domain *sd = env->sd;
				8016
				8017	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	8018
				8019	/*
				8020	* ASYM_PACKING needs to force migrate tasks from busy but
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	8021	* lower priority CPUs in order to pack all tasks in the
				8022	* highest priority CPUs.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	8023	*/
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	8024	if ((sd->flags & SD_ASYM_PACKING) &&
				8025	sched_asym_prefer(env->dst_cpu, env->src_cpu))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	8026	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	8027	}
				8028
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8029	/*
				8030	* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
				8031	* It's worth migrating the task if the src_cpu's capacity is reduced
				8032	* because of other sched_class or IRQs if more capacity stays
				8033	* available on dst_cpu.
				8034	*/
				8035	if ((env->idle != CPU_NOT_IDLE) &&
				8036	(env->src_rq->cfs.h_nr_running == 1)) {
				8037	if ((check_cpu_capacity(env->src_rq, sd)) &&
				8038	(capacity_of(env->src_cpu)sd->imbalance_pct < capacity_of(env->dst_cpu)100))
				8039	return 1;
				8040	}
				8041
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	8042	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				8043	}
				8044
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8045	static int active_load_balance_cpu_stop(void *data);
				8046
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8047	static int should_we_balance(struct lb_env *env)
				8048	{
				8049	struct sched_group *sg = env->sd->groups;
				8050	struct cpumask sg_cpus, sg_mask;
				8051	int cpu, balance_cpu = -1;
				8052
				8053	/*
				8054	* In the newly idle case, we will allow all the cpu's
				8055	* to do the newly idle load balance.
				8056	*/
				8057	if (env->idle == CPU_NEWLY_IDLE)
				8058	return 1;
				8059
				8060	sg_cpus = sched_group_cpus(sg);
				8061	sg_mask = sched_group_mask(sg);
				8062	/* Try to find first idle cpu */
				8063	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
				8064	if (!cpumask_test_cpu(cpu, sg_mask) \|\| !idle_cpu(cpu))
				8065	continue;
				8066
				8067	balance_cpu = cpu;
				8068	break;
				8069	}
				8070
				8071	if (balance_cpu == -1)
				8072	balance_cpu = group_balance_cpu(sg);
				8073
				8074	/*
				8075	* First idle cpu or the first cpu(busiest) in this sched group
				8076	* is eligible for doing load balancing at this and above domains.
				8077	*/
Joonsoo Kim	b0cff9d	2013-09-10 15:54:49 +0900	[diff] [blame]	8078	return balance_cpu == env->dst_cpu;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8079	}
				8080
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8081	/*
				8082	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				8083	* tasks if there is an imbalance.
				8084	*/
				8085	static int load_balance(int this_cpu, struct rq *this_rq,
				8086	struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8087	int *continue_balancing)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8088	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8089	int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8090	struct sched_domain *sd_parent = sd->parent;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8091	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8092	struct rq *busiest;
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8093	struct rq_flags rf;
Christoph Lameter	4ba2968	2014-08-26 19:12:21 -0500	[diff] [blame]	8094	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8095
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8096	struct lb_env env = {
				8097	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	8098	.dst_cpu = this_cpu,
				8099	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8100	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8101	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	8102	.loop_break = sched_nr_migrate_break,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	8103	.cpus = cpus,
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	8104	.fbq_type = all,
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8105	.tasks = LIST_HEAD_INIT(env.tasks),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8106	};
				8107
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	8108	/*
				8109	* For NEWLY_IDLE load_balancing, we don't need to consider
				8110	* other cpus in our group
				8111	*/
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	8112	if (idle == CPU_NEWLY_IDLE)
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	8113	env.dst_grpmask = NULL;
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	8114
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8115	cpumask_copy(cpus, cpu_active_mask);
				8116
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8117	schedstat_inc(sd->lb_count[idle]);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8118
				8119	redo:
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8120	if (!should_we_balance(&env)) {
				8121	*continue_balancing = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8122	goto out_balanced;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8123	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8124
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8125	group = find_busiest_group(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8126	if (!group) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8127	schedstat_inc(sd->lb_nobusyg[idle]);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8128	goto out_balanced;
				8129	}
				8130
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	8131	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8132	if (!busiest) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8133	schedstat_inc(sd->lb_nobusyq[idle]);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8134	goto out_balanced;
				8135	}
				8136
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	8137	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8138
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8139	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8140
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8141	env.src_cpu = busiest->cpu;
				8142	env.src_rq = busiest;
				8143
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8144	ld_moved = 0;
				8145	if (busiest->nr_running > 1) {
				8146	/*
				8147	* Attempt to move tasks. If find_busiest_group has found
				8148	* an imbalance but busiest->nr_running <= 1, the group is
				8149	* still unbalanced. ld_moved simply stays zero, so it is
				8150	* correctly treated as an imbalance.
				8151	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8152	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	8153	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8154
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	8155	more_balance:
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8156	rq_lock_irqsave(busiest, &rf);
Peter Zijlstra	3bed5e2	2016-10-03 16:35:32 +0200	[diff] [blame]	8157	update_rq_clock(busiest);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8158
				8159	/*
				8160	* cur_ld_moved - load moved in current iteration
				8161	* ld_moved - cumulative load moved across iterations
				8162	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8163	cur_ld_moved = detach_tasks(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8164
				8165	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8166	* We've detached some tasks from busiest_rq. Every
				8167	* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
				8168	* unlock busiest->lock, and we are able to be sure
				8169	* that nobody can manipulate the tasks in parallel.
				8170	* See task_rq_lock() family for the details.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8171	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8172
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8173	rq_unlock(busiest, &rf);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8174
				8175	if (cur_ld_moved) {
				8176	attach_tasks(&env);
				8177	ld_moved += cur_ld_moved;
				8178	}
				8179
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8180	local_irq_restore(rf.flags);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8181
Joonsoo Kim	f1cd085	2013-04-23 17:27:37 +0900	[diff] [blame]	8182	if (env.flags & LBF_NEED_BREAK) {
				8183	env.flags &= ~LBF_NEED_BREAK;
				8184	goto more_balance;
				8185	}
				8186
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8187	/*
				8188	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				8189	* us and move them to an alternate dst_cpu in our sched_group
				8190	* where they can run. The upper limit on how many times we
				8191	* iterate on same src_cpu is dependent on number of cpus in our
				8192	* sched_group.
				8193	*
				8194	* This changes load balance semantics a bit on who can move
				8195	* load to a given_cpu. In addition to the given_cpu itself
				8196	* (or a ilb_cpu acting on its behalf where given_cpu is
				8197	* nohz-idle), we now have balance_cpu in a position to move
				8198	* load to given_cpu. In rare situations, this may cause
				8199	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				8200	* _independently_ and at _same_ time to move some load to
				8201	* given_cpu) causing exceess load to be moved to given_cpu.
				8202	* This however should not happen so much in practice and
				8203	* moreover subsequent load balance cycles should correct the
				8204	* excess load moved.
				8205	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8206	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8207
Vladimir Davydov	7aff2e3	2013-09-15 21:30:13 +0400	[diff] [blame]	8208	/* Prevent to re-select dst_cpu via env's cpus */
				8209	cpumask_clear_cpu(env.dst_cpu, env.cpus);
				8210
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	8211	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8212	env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8213	env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8214	env.loop = 0;
				8215	env.loop_break = sched_nr_migrate_break;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	8216
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8217	/*
				8218	* Go back to "more_balance" rather than "redo" since we
				8219	* need to continue with same src_cpu.
				8220	*/
				8221	goto more_balance;
				8222	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8223
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8224	/*
				8225	* We failed to reach balance because of affinity.
				8226	*/
				8227	if (sd_parent) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	8228	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8229
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	8230	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8231	*group_imbalance = 1;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8232	}
				8233
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8234	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8235	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8236	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	8237	if (!cpumask_empty(cpus)) {
				8238	env.loop = 0;
				8239	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8240	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	8241	}
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	8242	goto out_all_pinned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8243	}
				8244	}
				8245
				8246	if (!ld_moved) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8247	schedstat_inc(sd->lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	8248	/*
				8249	* Increment the failure counter only on periodic balance.
				8250	* We do not want newidle balance, which can be very
				8251	* frequent, pollute the failure counter causing
				8252	* excessive cache_hot migrations and active balances.
				8253	*/
				8254	if (idle != CPU_NEWLY_IDLE)
				8255	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8256
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	8257	if (need_active_balance(&env)) {
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8258	unsigned long flags;
				8259
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8260	raw_spin_lock_irqsave(&busiest->lock, flags);
				8261
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8262	/* don't kick the active_load_balance_cpu_stop,
				8263	* if the curr task on busiest cpu can't be
				8264	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8265	*/
Ingo Molnar	0c98d34	2017-02-05 15:38:10 +0100	[diff] [blame]	8266	if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8267	raw_spin_unlock_irqrestore(&busiest->lock,
				8268	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8269	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8270	goto out_one_pinned;
				8271	}
				8272
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8273	/*
				8274	* ->active_balance synchronizes accesses to
				8275	* ->active_balance_work. Once set, it's cleared
				8276	* only after active load balance is finished.
				8277	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8278	if (!busiest->active_balance) {
				8279	busiest->active_balance = 1;
				8280	busiest->push_cpu = this_cpu;
				8281	active_balance = 1;
				8282	}
				8283	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8284
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	8285	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8286	stop_one_cpu_nowait(cpu_of(busiest),
				8287	active_load_balance_cpu_stop, busiest,
				8288	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	8289	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8290
Srikar Dronamraju	d02c07118	2016-03-23 17:54:44 +0530	[diff] [blame]	8291	/* We've kicked active balancing, force task migration. */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8292	sd->nr_balance_failed = sd->cache_nice_tries+1;
				8293	}
				8294	} else
				8295	sd->nr_balance_failed = 0;
				8296
				8297	if (likely(!active_balance)) {
				8298	/* We were unbalanced, so reset the balancing interval */
				8299	sd->balance_interval = sd->min_interval;
				8300	} else {
				8301	/*
				8302	* If we've begun active balancing, start to back off. This
				8303	* case may not be covered by the all_pinned logic if there
				8304	* is only 1 task on the busy runqueue (because we don't call
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8305	* detach_tasks).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8306	*/
				8307	if (sd->balance_interval < sd->max_interval)
				8308	sd->balance_interval *= 2;
				8309	}
				8310
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8311	goto out;
				8312
				8313	out_balanced:
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	8314	/*
				8315	* We reach balance although we may have faced some affinity
				8316	* constraints. Clear the imbalance flag if it was set.
				8317	*/
				8318	if (sd_parent) {
				8319	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
				8320
				8321	if (*group_imbalance)
				8322	*group_imbalance = 0;
				8323	}
				8324
				8325	out_all_pinned:
				8326	/*
				8327	* We reach balance because all tasks are pinned at this level so
				8328	* we can't migrate them. Let the imbalance flag set so parent level
				8329	* can try to migrate them.
				8330	*/
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8331	schedstat_inc(sd->lb_balanced[idle]);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8332
				8333	sd->nr_balance_failed = 0;
				8334
				8335	out_one_pinned:
				8336	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8337	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	8338	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8339	(sd->balance_interval < sd->max_interval))
				8340	sd->balance_interval *= 2;
				8341
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	8342	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8343	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8344	return ld_moved;
				8345	}
				8346
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8347	static inline unsigned long
				8348	get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
				8349	{
				8350	unsigned long interval = sd->balance_interval;
				8351
				8352	if (cpu_busy)
				8353	interval *= sd->busy_factor;
				8354
				8355	/* scale ms to jiffies */
				8356	interval = msecs_to_jiffies(interval);
				8357	interval = clamp(interval, 1UL, max_load_balance_interval);
				8358
				8359	return interval;
				8360	}
				8361
				8362	static inline void
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8363	update_next_balance(struct sched_domain sd, unsigned long next_balance)
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8364	{
				8365	unsigned long interval, next;
				8366
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8367	/* used by idle balance, so cpu_busy = 0 */
				8368	interval = get_sd_balance_interval(sd, 0);
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8369	next = sd->last_balance + interval;
				8370
				8371	if (time_after(*next_balance, next))
				8372	*next_balance = next;
				8373	}
				8374
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8375	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8376	* idle_balance is called by schedule() if this_cpu is about to become
				8377	* idle. Attempts to pull tasks from other CPUs.
				8378	*/
Matt Fleming	46f69fa	2016-09-21 14:38:12 +0100	[diff] [blame]	8379	static int idle_balance(struct rq this_rq, struct rq_flags rf)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8380	{
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8381	unsigned long next_balance = jiffies + HZ;
				8382	int this_cpu = this_rq->cpu;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8383	struct sched_domain *sd;
				8384	int pulled_task = 0;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8385	u64 curr_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8386
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8387	/*
				8388	* We must set idle_stamp _before_ calling idle_balance(), such that we
				8389	* measure the duration of idle_balance() as idle time.
				8390	*/
				8391	this_rq->idle_stamp = rq_clock(this_rq);
				8392
Matt Fleming	46f69fa	2016-09-21 14:38:12 +0100	[diff] [blame]	8393	/*
				8394	* This is OK, because current is on_cpu, which avoids it being picked
				8395	* for load-balance and preemption/IRQs are still disabled avoiding
				8396	* further scheduler activity on it and we're being very careful to
				8397	* re-start the picking loop.
				8398	*/
				8399	rq_unpin_lock(this_rq, rf);
				8400
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	8401	if (this_rq->avg_idle < sysctl_sched_migration_cost \|\|
				8402	!this_rq->rd->overload) {
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8403	rcu_read_lock();
				8404	sd = rcu_dereference_check_sched_domain(this_rq->sd);
				8405	if (sd)
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8406	update_next_balance(sd, &next_balance);
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8407	rcu_read_unlock();
				8408
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8409	goto out;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8410	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8411
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8412	raw_spin_unlock(&this_rq->lock);
				8413
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	8414	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8415	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8416	for_each_domain(this_cpu, sd) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8417	int continue_balancing = 1;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8418	u64 t0, domain_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8419
				8420	if (!(sd->flags & SD_LOAD_BALANCE))
				8421	continue;
				8422
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8423	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8424	update_next_balance(sd, &next_balance);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8425	break;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8426	}
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8427
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8428	if (sd->flags & SD_BALANCE_NEWIDLE) {
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8429	t0 = sched_clock_cpu(this_cpu);
				8430
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8431	pulled_task = load_balance(this_cpu, this_rq,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8432	sd, CPU_NEWLY_IDLE,
				8433	&continue_balancing);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8434
				8435	domain_cost = sched_clock_cpu(this_cpu) - t0;
				8436	if (domain_cost > sd->max_newidle_lb_cost)
				8437	sd->max_newidle_lb_cost = domain_cost;
				8438
				8439	curr_cost += domain_cost;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8440	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8441
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8442	update_next_balance(sd, &next_balance);
Jason Low	39a4d9c	2014-04-23 18:30:35 -0700	[diff] [blame]	8443
				8444	/*
				8445	* Stop searching for tasks to pull if there are
				8446	* now runnable tasks on this rq.
				8447	*/
				8448	if (pulled_task \|\| this_rq->nr_running > 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8449	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8450	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8451	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8452
				8453	raw_spin_lock(&this_rq->lock);
				8454
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	8455	if (curr_cost > this_rq->max_idle_balance_cost)
				8456	this_rq->max_idle_balance_cost = curr_cost;
				8457
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	8458	/*
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	8459	* While browsing the domains, we released the rq lock, a task could
				8460	* have been enqueued in the meantime. Since we're not going idle,
				8461	* pretend we pulled a task.
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	8462	*/
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	8463	if (this_rq->cfs.h_nr_running && !pulled_task)
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8464	pulled_task = 1;
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	8465
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8466	out:
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8467	/* Move the next balance forward */
				8468	if (time_after(this_rq->next_balance, next_balance))
				8469	this_rq->next_balance = next_balance;
				8470
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	8471	/* Is there a task of a high priority class? */
Kirill Tkhai	4638364	2014-03-15 02:15:07 +0400	[diff] [blame]	8472	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	8473	pulled_task = -1;
				8474
Dietmar Eggemann	38c6ade	2015-10-20 13:04:41 +0100	[diff] [blame]	8475	if (pulled_task)
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8476	this_rq->idle_stamp = 0;
				8477
Matt Fleming	46f69fa	2016-09-21 14:38:12 +0100	[diff] [blame]	8478	rq_repin_lock(this_rq, rf);
				8479
Daniel Lezcano	3c4017c	2014-01-17 10:04:03 +0100	[diff] [blame]	8480	return pulled_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8481	}
				8482
				8483	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8484	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				8485	* running tasks off the busiest CPU onto idle CPUs. It requires at
				8486	* least 1 task to be running on each physical CPU where possible, and
				8487	* avoids physical / logical imbalances.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8488	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8489	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8490	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8491	struct rq *busiest_rq = data;
				8492	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8493	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8494	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8495	struct sched_domain *sd;
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	8496	struct task_struct *p = NULL;
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8497	struct rq_flags rf;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8498
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8499	rq_lock_irq(busiest_rq, &rf);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8500
				8501	/* make sure the requested cpu hasn't gone down in the meantime */
				8502	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				8503	!busiest_rq->active_balance))
				8504	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8505
				8506	/* Is there any task to move? */
				8507	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8508	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8509
				8510	/*
				8511	* This condition is "impossible", if it occurs
				8512	* we need to fix it. Originally reported by
				8513	* Bjorn Helgaas on a 128-cpu setup.
				8514	*/
				8515	BUG_ON(busiest_rq == target_rq);
				8516
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8517	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8518	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8519	for_each_domain(target_cpu, sd) {
				8520	if ((sd->flags & SD_LOAD_BALANCE) &&
				8521	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				8522	break;
				8523	}
				8524
				8525	if (likely(sd)) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8526	struct lb_env env = {
				8527	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	8528	.dst_cpu = target_cpu,
				8529	.dst_rq = target_rq,
				8530	.src_cpu = busiest_rq->cpu,
				8531	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8532	.idle = CPU_IDLE,
				8533	};
				8534
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8535	schedstat_inc(sd->alb_count);
Peter Zijlstra	3bed5e2	2016-10-03 16:35:32 +0200	[diff] [blame]	8536	update_rq_clock(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8537
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	8538	p = detach_one_task(&env);
Srikar Dronamraju	d02c07118	2016-03-23 17:54:44 +0530	[diff] [blame]	8539	if (p) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8540	schedstat_inc(sd->alb_pushed);
Srikar Dronamraju	d02c07118	2016-03-23 17:54:44 +0530	[diff] [blame]	8541	/* Active balancing done, reset the failure counter. */
				8542	sd->nr_balance_failed = 0;
				8543	} else {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8544	schedstat_inc(sd->alb_failed);
Srikar Dronamraju	d02c07118	2016-03-23 17:54:44 +0530	[diff] [blame]	8545	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8546	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8547	rcu_read_unlock();
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8548	out_unlock:
				8549	busiest_rq->active_balance = 0;
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8550	rq_unlock(busiest_rq, &rf);
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	8551
				8552	if (p)
				8553	attach_one_task(target_rq, p);
				8554
				8555	local_irq_enable();
				8556
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8557	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8558	}
				8559
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	8560	static inline int on_null_domain(struct rq *rq)
				8561	{
				8562	return unlikely(!rcu_dereference_sched(rq->sd));
				8563	}
				8564
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8565	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8566	/*
				8567	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8568	* - When one of the busy CPUs notices that there may be an idle rebalancing
				8569	* needed, they will kick the idle load balancer, which then does idle
				8570	* load balancing for all the idle CPUs.
				8571	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8572	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8573	cpumask_var_t idle_cpus_mask;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8574	atomic_t nr_cpus;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8575	unsigned long next_balance; /* in jiffy units */
				8576	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8577
Daniel Lezcano	3dd0337	2014-01-06 12:34:41 +0100	[diff] [blame]	8578	static inline int find_new_ilb(void)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8579	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8580	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8581
Suresh Siddha	786d6dc	2011-12-01 17:07:35 -0800	[diff] [blame]	8582	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				8583	return ilb;
				8584
				8585	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8586	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8587
				8588	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8589	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				8590	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				8591	* CPU (if there is one).
				8592	*/
Daniel Lezcano	0aeeeeb	2014-01-06 12:34:42 +0100	[diff] [blame]	8593	static void nohz_balancer_kick(void)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8594	{
				8595	int ilb_cpu;
				8596
				8597	nohz.next_balance++;
				8598
Daniel Lezcano	3dd0337	2014-01-06 12:34:41 +0100	[diff] [blame]	8599	ilb_cpu = find_new_ilb();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8600
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8601	if (ilb_cpu >= nr_cpu_ids)
				8602	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8603
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	8604	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8605	return;
				8606	/*
				8607	* Use smp_send_reschedule() instead of resched_cpu().
				8608	* This way we generate a sched IPI on the target cpu which
				8609	* is idle. And the softirq performing nohz idle load balance
				8610	* will be run before returning from the IPI.
				8611	*/
				8612	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8613	return;
				8614	}
				8615
Thomas Gleixner	20a5c8c	2016-03-10 12:54:20 +0100	[diff] [blame]	8616	void nohz_balance_exit_idle(unsigned int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	8617	{
				8618	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	8619	/*
				8620	* Completely isolated CPUs don't ever set, so we must test.
				8621	*/
				8622	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
				8623	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				8624	atomic_dec(&nohz.nr_cpus);
				8625	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	8626	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				8627	}
				8628	}
				8629
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8630	static inline void set_cpu_sd_state_busy(void)
				8631	{
				8632	struct sched_domain *sd;
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	8633	int cpu = smp_processor_id();
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8634
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8635	rcu_read_lock();
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8636	sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	8637
				8638	if (!sd \|\| !sd->nohz_idle)
				8639	goto unlock;
				8640	sd->nohz_idle = 0;
				8641
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8642	atomic_inc(&sd->shared->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	8643	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8644	rcu_read_unlock();
				8645	}
				8646
				8647	void set_cpu_sd_state_idle(void)
				8648	{
				8649	struct sched_domain *sd;
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	8650	int cpu = smp_processor_id();
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8651
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8652	rcu_read_lock();
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8653	sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	8654
				8655	if (!sd \|\| sd->nohz_idle)
				8656	goto unlock;
				8657	sd->nohz_idle = 1;
				8658
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8659	atomic_dec(&sd->shared->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	8660	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8661	rcu_read_unlock();
				8662	}
				8663
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8664	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8665	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8666	* This info will be used in performing idle load balancing in the future.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8667	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8668	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8669	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	8670	/*
				8671	* If this cpu is going down, then nothing needs to be done.
				8672	*/
				8673	if (!cpu_active(cpu))
				8674	return;
				8675
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8676	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				8677	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8678
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	8679	/*
				8680	* If we're a completely isolated CPU, we don't play.
				8681	*/
				8682	if (on_null_domain(cpu_rq(cpu)))
				8683	return;
				8684
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8685	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				8686	atomic_inc(&nohz.nr_cpus);
				8687	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8688	}
				8689	#endif
				8690
				8691	static DEFINE_SPINLOCK(balancing);
				8692
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	8693	/*
				8694	* Scale the max load_balance interval with the number of CPUs in the system.
				8695	* This trades load-balance latency on larger machines for less cross talk.
				8696	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8697	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	8698	{
				8699	max_load_balance_interval = HZ*num_online_cpus()/10;
				8700	}
				8701
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8702	/*
				8703	* It checks each scheduling domain to see if it is due to be balanced,
				8704	* and initiates a balancing operation if so.
				8705	*
Libin	b9b0853	2013-04-01 19:14:01 +0800	[diff] [blame]	8706	* Balancing parameters are set up in init_sched_domains.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8707	*/
Daniel Lezcano	f7ed0a8	2014-01-06 12:34:43 +0100	[diff] [blame]	8708	static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8709	{
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8710	int continue_balancing = 1;
Daniel Lezcano	f7ed0a8	2014-01-06 12:34:43 +0100	[diff] [blame]	8711	int cpu = rq->cpu;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8712	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	8713	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8714	/* Earliest time when we have to do rebalance again */
				8715	unsigned long next_balance = jiffies + 60*HZ;
				8716	int update_next_balance = 0;
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8717	int need_serialize, need_decay = 0;
				8718	u64 max_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8719
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	8720	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	8721
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8722	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8723	for_each_domain(cpu, sd) {
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8724	/*
				8725	* Decay the newidle max times here because this is a regular
				8726	* visit to all the domains. Decay ~1% per second.
				8727	*/
				8728	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
				8729	sd->max_newidle_lb_cost =
				8730	(sd->max_newidle_lb_cost * 253) / 256;
				8731	sd->next_decay_max_lb_cost = jiffies + HZ;
				8732	need_decay = 1;
				8733	}
				8734	max_cost += sd->max_newidle_lb_cost;
				8735
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8736	if (!(sd->flags & SD_LOAD_BALANCE))
				8737	continue;
				8738
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8739	/*
				8740	* Stop the load balance at this level. There is another
				8741	* CPU in our sched group which is doing load balancing more
				8742	* actively.
				8743	*/
				8744	if (!continue_balancing) {
				8745	if (need_decay)
				8746	continue;
				8747	break;
				8748	}
				8749
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8750	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8751
				8752	need_serialize = sd->flags & SD_SERIALIZE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8753	if (need_serialize) {
				8754	if (!spin_trylock(&balancing))
				8755	goto out;
				8756	}
				8757
				8758	if (time_after_eq(jiffies, sd->last_balance + interval)) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8759	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8760	/*
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8761	* The LBF_DST_PINNED logic could have changed
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	8762	* env->dst_cpu, so we can't know our idle
				8763	* state even if we migrated tasks. Update it.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8764	*/
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	8765	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8766	}
				8767	sd->last_balance = jiffies;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8768	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8769	}
				8770	if (need_serialize)
				8771	spin_unlock(&balancing);
				8772	out:
				8773	if (time_after(next_balance, sd->last_balance + interval)) {
				8774	next_balance = sd->last_balance + interval;
				8775	update_next_balance = 1;
				8776	}
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8777	}
				8778	if (need_decay) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8779	/*
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8780	* Ensure the rq-wide value also decays but keep it at a
				8781	* reasonable floor to avoid funnies with rq->avg_idle.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8782	*/
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8783	rq->max_idle_balance_cost =
				8784	max((u64)sysctl_sched_migration_cost, max_cost);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8785	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8786	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8787
				8788	/*
				8789	* next_balance will be updated only when there is a need.
				8790	* When the cpu is attached to null domain for ex, it will not be
				8791	* updated.
				8792	*/
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8793	if (likely(update_next_balance)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8794	rq->next_balance = next_balance;
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8795
				8796	#ifdef CONFIG_NO_HZ_COMMON
				8797	/*
				8798	* If this CPU has been elected to perform the nohz idle
				8799	* balance. Other idle CPUs have already rebalanced with
				8800	* nohz_idle_balance() and nohz.next_balance has been
				8801	* updated accordingly. This CPU is now running the idle load
				8802	* balance for itself and we need to update the
				8803	* nohz.next_balance accordingly.
				8804	*/
				8805	if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
				8806	nohz.next_balance = rq->next_balance;
				8807	#endif
				8808	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8809	}
				8810
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8811	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8812	/*
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8813	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8814	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				8815	*/
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8816	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8817	{
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8818	int this_cpu = this_rq->cpu;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8819	struct rq *rq;
				8820	int balance_cpu;
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8821	/* Earliest time when we have to do rebalance again */
				8822	unsigned long next_balance = jiffies + 60*HZ;
				8823	int update_next_balance = 0;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8824
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8825	if (idle != CPU_IDLE \|\|
				8826	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				8827	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8828
				8829	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	8830	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8831	continue;
				8832
				8833	/*
				8834	* If this cpu gets work to do, stop the load balancing
				8835	* work being done for other cpus. Next load
				8836	* balancing owner will pick it up.
				8837	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8838	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8839	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8840
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	8841	rq = cpu_rq(balance_cpu);
				8842
Tim Chen	ed61bbc	2014-05-20 14:39:27 -0700	[diff] [blame]	8843	/*
				8844	* If time for next balance is due,
				8845	* do the balance.
				8846	*/
				8847	if (time_after_eq(jiffies, rq->next_balance)) {
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8848	struct rq_flags rf;
				8849
				8850	rq_lock_irq(rq, &rf);
Tim Chen	ed61bbc	2014-05-20 14:39:27 -0700	[diff] [blame]	8851	update_rq_clock(rq);
Frederic Weisbecker	cee1afc	2016-04-13 15:56:50 +0200	[diff] [blame]	8852	cpu_load_update_idle(rq);
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	8853	rq_unlock_irq(rq, &rf);
				8854
Tim Chen	ed61bbc	2014-05-20 14:39:27 -0700	[diff] [blame]	8855	rebalance_domains(rq, CPU_IDLE);
				8856	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8857
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8858	if (time_after(next_balance, rq->next_balance)) {
				8859	next_balance = rq->next_balance;
				8860	update_next_balance = 1;
				8861	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8862	}
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8863
				8864	/*
				8865	* next_balance will be updated only when there is a need.
				8866	* When the CPU is attached to null domain for ex, it will not be
				8867	* updated.
				8868	*/
				8869	if (likely(update_next_balance))
				8870	nohz.next_balance = next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8871	end:
				8872	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8873	}
				8874
				8875	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8876	* Current heuristic for kicking the idle load balancer in the presence
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8877	* of an idle cpu in the system.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8878	* - This rq has more than one task.
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8879	* - This rq has at least one CFS task and the capacity of the CPU is
				8880	* significantly reduced because of RT tasks or IRQs.
				8881	* - At parent of LLC scheduler domain level, this cpu's scheduler group has
				8882	* multiple busy cpu.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8883	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				8884	* domain span are idle.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8885	*/
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8886	static inline bool nohz_kick_needed(struct rq *rq)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8887	{
				8888	unsigned long now = jiffies;
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8889	struct sched_domain_shared *sds;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8890	struct sched_domain *sd;
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	8891	int nr_busy, i, cpu = rq->cpu;
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8892	bool kick = false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8893
Daniel Lezcano	4a72562	2014-01-06 12:34:39 +0100	[diff] [blame]	8894	if (unlikely(rq->idle_balance))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8895	return false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8896
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8897	/*
				8898	* We may be recently in ticked or tickless idle mode. At the first
				8899	* busy tick after returning from idle, we will update the busy stats.
				8900	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8901	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8902	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8903
				8904	/*
				8905	* None are in tickless mode and hence no need for NOHZ idle load
				8906	* balancing.
				8907	*/
				8908	if (likely(!atomic_read(&nohz.nr_cpus)))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8909	return false;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8910
				8911	if (time_before(now, nohz.next_balance))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8912	return false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8913
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8914	if (rq->nr_running >= 2)
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8915	return true;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8916
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	8917	rcu_read_lock();
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8918	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				8919	if (sds) {
				8920	/*
				8921	* XXX: write a coherent comment on why we do this.
				8922	* See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
				8923	*/
				8924	nr_busy = atomic_read(&sds->nr_busy_cpus);
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8925	if (nr_busy > 1) {
				8926	kick = true;
				8927	goto unlock;
				8928	}
				8929
				8930	}
				8931
				8932	sd = rcu_dereference(rq->sd);
				8933	if (sd) {
				8934	if ((rq->cfs.h_nr_running >= 1) &&
				8935	check_cpu_capacity(rq, sd)) {
				8936	kick = true;
				8937	goto unlock;
				8938	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8939	}
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	8940
				8941	sd = rcu_dereference(per_cpu(sd_asym, cpu));
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	8942	if (sd) {
				8943	for_each_cpu(i, sched_domain_span(sd)) {
				8944	if (i == cpu \|\|
				8945	!cpumask_test_cpu(i, nohz.idle_cpus_mask))
				8946	continue;
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	8947
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame]	8948	if (sched_asym_prefer(i, cpu)) {
				8949	kick = true;
				8950	goto unlock;
				8951	}
				8952	}
				8953	}
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8954	unlock:
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	8955	rcu_read_unlock();
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8956	return kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8957	}
				8958	#else
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8959	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8960	#endif
				8961
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8962	/*
				8963	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8964	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8965	*/
Emese Revfy	0766f78	2016-06-20 20:42:34 +0200	[diff] [blame]	8966	static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8967	{
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8968	struct rq *this_rq = this_rq();
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	8969	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8970	CPU_IDLE : CPU_NOT_IDLE;
				8971
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8972	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8973	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8974	* balancing on behalf of the other idle cpus whose ticks are
Preeti U Murthy	d4573c3	2015-03-26 18:32:44 +0530	[diff] [blame]	8975	* stopped. Do nohz_idle_balance before rebalance_domains to
				8976	* give the idle cpus a chance to load balance. Else we may
				8977	* load balance only within the local sched_domain hierarchy
				8978	* and abort nohz_idle_balance altogether if we pull some load.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8979	*/
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8980	nohz_idle_balance(this_rq, idle);
Preeti U Murthy	d4573c3	2015-03-26 18:32:44 +0530	[diff] [blame]	8981	rebalance_domains(this_rq, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8982	}
				8983
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8984	/*
				8985	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8986	*/
Daniel Lezcano	7caff66	2014-01-06 12:34:38 +0100	[diff] [blame]	8987	void trigger_load_balance(struct rq *rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8988	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8989	/* Don't need to rebalance while attached to NULL domain */
Daniel Lezcano	c726099	2014-01-06 12:34:45 +0100	[diff] [blame]	8990	if (unlikely(on_null_domain(rq)))
				8991	return;
				8992
				8993	if (time_after_eq(jiffies, rq->next_balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8994	raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8995	#ifdef CONFIG_NO_HZ_COMMON
Daniel Lezcano	c726099	2014-01-06 12:34:45 +0100	[diff] [blame]	8996	if (nohz_kick_needed(rq))
Daniel Lezcano	0aeeeeb	2014-01-06 12:34:42 +0100	[diff] [blame]	8997	nohz_balancer_kick();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8998	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8999	}
				9000
/* Refresh the sysctl-derived values, then the runtime_enabled state of @rq. */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
	update_runtime_enabled(rq);
}
				9007
				9008	static void rq_offline_fair(struct rq *rq)
				9009	{
				9010	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	9011
				9012	/* Ensure any throttled groups are reachable by pick_next_task */
				9013	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	9014	}
				9015
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	9016	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	9017
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9018	/*
				9019	* scheduler tick hitting a task of our scheduling class:
				9020	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	9021	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9022	{
				9023	struct cfs_rq *cfs_rq;
				9024	struct sched_entity *se = &curr->se;
				9025
				9026	for_each_sched_entity(se) {
				9027	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	9028	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9029	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	9030
Srikar Dronamraju	b52da86	2015-10-02 07:48:25 +0530	[diff] [blame]	9031	if (static_branch_unlikely(&sched_numa_balancing))
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	9032	task_tick_numa(rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9033	}
				9034
				9035	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	9036	* called on fork with the child task as argument from the parent's context
				9037	* - child not yet on the tasklist
				9038	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9039	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	9040	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9041	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	9042	struct cfs_rq *cfs_rq;
				9043	struct sched_entity se = &p->se, curr;
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	9044	struct rq *rq = this_rq();
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	9045	struct rq_flags rf;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9046
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	9047	rq_lock(rq, &rf);
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	9048	update_rq_clock(rq);
				9049
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	9050	cfs_rq = task_cfs_rq(current);
				9051	curr = cfs_rq->curr;
Peter Zijlstra	e210bff	2016-06-16 18:51:48 +0200	[diff] [blame]	9052	if (curr) {
				9053	update_curr(cfs_rq);
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	9054	se->vruntime = curr->vruntime;
Peter Zijlstra	e210bff	2016-06-16 18:51:48 +0200	[diff] [blame]	9055	}
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	9056	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	9057
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	9058	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	9059	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	9060	* Upon rescheduling, sched_class::put_prev_task() will place
				9061	* 'current' within the tree based on its new key value.
				9062	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	9063	swap(curr->vruntime, se->vruntime);
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	9064	resched_curr(rq);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	9065	}
				9066
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	9067	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	9068	rq_unlock(rq, &rf);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9069	}
				9070
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9071	/*
				9072	* Priority of the task has changed. Check to see if we preempt
				9073	* the current task.
				9074	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9075	static void
				9076	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9077	{
Kirill Tkhai	da0c1e6	2014-08-20 13:47:32 +0400	[diff] [blame]	9078	if (!task_on_rq_queued(p))
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9079	return;
				9080
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9081	/*
				9082	* Reschedule if we are currently running on this runqueue and
				9083	* our priority decreased, or if we are not currently running on
				9084	* this runqueue and our priority is higher than the current's
				9085	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9086	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9087	if (p->prio > oldprio)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	9088	resched_curr(rq);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9089	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	9090	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9091	}
				9092
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9093	static inline bool vruntime_normalized(struct task_struct *p)
				9094	{
				9095	struct sched_entity *se = &p->se;
				9096
				9097	/*
				9098	* In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
				9099	* the dequeue_entity(.flags=0) will already have normalized the
				9100	* vruntime.
				9101	*/
				9102	if (p->on_rq)
				9103	return true;
				9104
				9105	/*
				9106	* When !on_rq, vruntime of the task has usually NOT been normalized.
				9107	* But there are some cases where it has already been normalized:
				9108	*
				9109	* - A forked child which is waiting for being woken up by
				9110	* wake_up_new_task().
				9111	* - A task which has been woken up by try_to_wake_up() and
				9112	* waiting for actually being woken up by sched_ttwu_pending().
				9113	*/
				9114	if (!se->sum_exec_runtime \|\| p->state == TASK_WAKING)
				9115	return true;
				9116
				9117	return false;
				9118	}
				9119
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	9120	#ifdef CONFIG_FAIR_GROUP_SCHED
				9121	/*
				9122	* Propagate the changes of the sched_entity across the tg tree to make it
				9123	* visible to the root
				9124	*/
				9125	static void propagate_entity_cfs_rq(struct sched_entity *se)
				9126	{
				9127	struct cfs_rq *cfs_rq;
				9128
				9129	/* Start to propagate at parent */
				9130	se = se->parent;
				9131
				9132	for_each_sched_entity(se) {
				9133	cfs_rq = cfs_rq_of(se);
				9134
				9135	if (cfs_rq_throttled(cfs_rq))
				9136	break;
				9137
				9138	update_load_avg(se, UPDATE_TG);
				9139	}
				9140	}
				9141	#else
				9142	static void propagate_entity_cfs_rq(struct sched_entity *se) { }
				9143	#endif
				9144
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	9145	static void detach_entity_cfs_rq(struct sched_entity *se)
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9146	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9147	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				9148
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	9149	/* Catch up with the cfs_rq and remove our load when we leave */
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	9150	update_load_avg(se, 0);
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	9151	detach_entity_load_avg(cfs_rq, se);
Peter Zijlstra	7c3edd2	2016-07-13 10:56:25 +0200	[diff] [blame]	9152	update_tg_load_avg(cfs_rq, false);
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	9153	propagate_entity_cfs_rq(se);
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9154	}
				9155
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	9156	static void attach_entity_cfs_rq(struct sched_entity *se)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9157	{
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9158	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9159
				9160	#ifdef CONFIG_FAIR_GROUP_SCHED
Michael wang	eb7a59b	2014-02-20 11:14:53 +0800	[diff] [blame]	9161	/*
				9162	* Since the real-depth could have been changed (only FAIR
				9163	* class maintain depth value), reset depth properly.
				9164	*/
				9165	se->depth = se->parent ? se->parent->depth + 1 : 0;
				9166	#endif
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9167
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	9168	/* Synchronize entity with its cfs_rq */
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	9169	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9170	attach_entity_load_avg(cfs_rq, se);
Peter Zijlstra	7c3edd2	2016-07-13 10:56:25 +0200	[diff] [blame]	9171	update_tg_load_avg(cfs_rq, false);
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	9172	propagate_entity_cfs_rq(se);
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	9173	}
				9174
				9175	static void detach_task_cfs_rq(struct task_struct *p)
				9176	{
				9177	struct sched_entity *se = &p->se;
				9178	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				9179
				9180	if (!vruntime_normalized(p)) {
				9181	/*
				9182	* Fix up our vruntime so that the current sleep doesn't
				9183	* cause 'unlimited' sleep bonus.
				9184	*/
				9185	place_entity(cfs_rq, se, 0);
				9186	se->vruntime -= cfs_rq->min_vruntime;
				9187	}
				9188
				9189	detach_entity_cfs_rq(se);
				9190	}
				9191
				9192	static void attach_task_cfs_rq(struct task_struct *p)
				9193	{
				9194	struct sched_entity *se = &p->se;
				9195	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				9196
				9197	attach_entity_cfs_rq(se);
Byungchul Park	6efdb10	2015-08-20 20:21:59 +0900	[diff] [blame]	9198
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9199	if (!vruntime_normalized(p))
				9200	se->vruntime += cfs_rq->min_vruntime;
				9201	}
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9202
/*
 * sched_class::switched_from hook (see fair_sched_class).
 *
 * @rq: runqueue of the task (unused here).
 * @p:  task being handled; its entity is detached from its cfs_rq.
 *
 * Both arguments are pointers: @p is handed straight to
 * detach_task_cfs_rq(), which takes a struct task_struct pointer.
 */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}
				9207
				9208	static void switched_to_fair(struct rq rq, struct task_struct p)
				9209	{
				9210	attach_task_cfs_rq(p);
				9211
				9212	if (task_on_rq_queued(p)) {
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9213	/*
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9214	* We were most likely switched from sched_rt, so
				9215	* kick off the schedule if running, otherwise just see
				9216	* if we can still preempt the current task.
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9217	*/
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9218	if (rq->curr == p)
				9219	resched_curr(rq);
				9220	else
				9221	check_preempt_curr(rq, p, 0);
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9222	}
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9223	}
				9224
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	9225	/* Account for a task changing its policy or group.
				9226	*
				9227	* This routine is mostly called to set cfs_rq->curr field when a task
				9228	* migrates between groups/classes.
				9229	*/
				9230	static void set_curr_task_fair(struct rq *rq)
				9231	{
				9232	struct sched_entity *se = &rq->curr->se;
				9233
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	9234	for_each_sched_entity(se) {
				9235	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				9236
				9237	set_next_entity(cfs_rq, se);
				9238	/* ensure bandwidth has been allocated on our new cfs_rq */
				9239	account_cfs_rq_runtime(cfs_rq, 0);
				9240	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	9241	}
				9242
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9243	void init_cfs_rq(struct cfs_rq *cfs_rq)
				9244	{
				9245	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9246	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				9247	#ifndef CONFIG_64BIT
				9248	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				9249	#endif
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	9250	#ifdef CONFIG_SMP
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	9251	#ifdef CONFIG_FAIR_GROUP_SCHED
				9252	cfs_rq->propagate_avg = 0;
				9253	#endif
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	9254	atomic_long_set(&cfs_rq->removed_load_avg, 0);
				9255	atomic_long_set(&cfs_rq->removed_util_avg, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	9256	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9257	}
				9258
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9259	#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittot	ea86cb4	2016-06-17 13:38:55 +0200	[diff] [blame]	9260	static void task_set_group_fair(struct task_struct *p)
				9261	{
				9262	struct sched_entity *se = &p->se;
				9263
				9264	set_task_rq(p, task_cpu(p));
				9265	se->depth = se->parent ? se->parent->depth + 1 : 0;
				9266	}
				9267
Peter Zijlstra	bc54da2	2015-08-31 17:13:55 +0200	[diff] [blame]	9268	static void task_move_group_fair(struct task_struct *p)
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9269	{
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9270	detach_task_cfs_rq(p);
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	9271	set_task_rq(p, task_cpu(p));
Byungchul Park	6efdb10	2015-08-20 20:21:59 +0900	[diff] [blame]	9272
				9273	#ifdef CONFIG_SMP
				9274	/* Tell se's cfs_rq has been changed -- migrated */
				9275	p->se.avg.last_update_time = 0;
				9276	#endif
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9277	attach_task_cfs_rq(p);
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9278	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9279
Vincent Guittot	ea86cb4	2016-06-17 13:38:55 +0200	[diff] [blame]	9280	static void task_change_group_fair(struct task_struct *p, int type)
				9281	{
				9282	switch (type) {
				9283	case TASK_SET_GROUP:
				9284	task_set_group_fair(p);
				9285	break;
				9286
				9287	case TASK_MOVE_GROUP:
				9288	task_move_group_fair(p);
				9289	break;
				9290	}
				9291	}
				9292
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9293	void free_fair_sched_group(struct task_group *tg)
				9294	{
				9295	int i;
				9296
				9297	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				9298
				9299	for_each_possible_cpu(i) {
				9300	if (tg->cfs_rq)
				9301	kfree(tg->cfs_rq[i]);
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9302	if (tg->se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9303	kfree(tg->se[i]);
				9304	}
				9305
				9306	kfree(tg->cfs_rq);
				9307	kfree(tg->se);
				9308	}
				9309
				9310	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				9311	{
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9312	struct sched_entity *se;
Peter Zijlstra	b7fa30c	2016-06-09 15:07:50 +0200	[diff] [blame]	9313	struct cfs_rq *cfs_rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9314	int i;
				9315
				9316	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				9317	if (!tg->cfs_rq)
				9318	goto err;
				9319	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				9320	if (!tg->se)
				9321	goto err;
				9322
				9323	tg->shares = NICE_0_LOAD;
				9324
				9325	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				9326
				9327	for_each_possible_cpu(i) {
				9328	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				9329	GFP_KERNEL, cpu_to_node(i));
				9330	if (!cfs_rq)
				9331	goto err;
				9332
				9333	se = kzalloc_node(sizeof(struct sched_entity),
				9334	GFP_KERNEL, cpu_to_node(i));
				9335	if (!se)
				9336	goto err_free_rq;
				9337
				9338	init_cfs_rq(cfs_rq);
				9339	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	9340	init_entity_runnable_average(se);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9341	}
				9342
				9343	return 1;
				9344
				9345	err_free_rq:
				9346	kfree(cfs_rq);
				9347	err:
				9348	return 0;
				9349	}
				9350
Peter Zijlstra	8663e24	2016-06-22 14:58:02 +0200	[diff] [blame]	9351	void online_fair_sched_group(struct task_group *tg)
				9352	{
				9353	struct sched_entity *se;
				9354	struct rq *rq;
				9355	int i;
				9356
				9357	for_each_possible_cpu(i) {
				9358	rq = cpu_rq(i);
				9359	se = tg->se[i];
				9360
				9361	raw_spin_lock_irq(&rq->lock);
Peter Zijlstra	4126bad	2016-10-03 16:20:59 +0200	[diff] [blame]	9362	update_rq_clock(rq);
Vincent Guittot	d032669	2016-11-08 10:53:47 +0100	[diff] [blame]	9363	attach_entity_cfs_rq(se);
Peter Zijlstra	55e16d3	2016-06-22 15:14:26 +0200	[diff] [blame]	9364	sync_throttle(tg, i);
Peter Zijlstra	8663e24	2016-06-22 14:58:02 +0200	[diff] [blame]	9365	raw_spin_unlock_irq(&rq->lock);
				9366	}
				9367	}
				9368
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9369	void unregister_fair_sched_group(struct task_group *tg)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9370	{
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9371	unsigned long flags;
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9372	struct rq *rq;
				9373	int cpu;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9374
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9375	for_each_possible_cpu(cpu) {
				9376	if (tg->se[cpu])
				9377	remove_entity_load_avg(tg->se[cpu]);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9378
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9379	/*
				9380	* Only empty task groups can be destroyed; so we can speculatively
				9381	* check on_list without danger of it being re-added.
				9382	*/
				9383	if (!tg->cfs_rq[cpu]->on_list)
				9384	continue;
				9385
				9386	rq = cpu_rq(cpu);
				9387
				9388	raw_spin_lock_irqsave(&rq->lock, flags);
				9389	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				9390	raw_spin_unlock_irqrestore(&rq->lock, flags);
				9391	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9392	}
				9393
				9394	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				9395	struct sched_entity *se, int cpu,
				9396	struct sched_entity *parent)
				9397	{
				9398	struct rq *rq = cpu_rq(cpu);
				9399
				9400	cfs_rq->tg = tg;
				9401	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9402	init_cfs_rq_runtime(cfs_rq);
				9403
				9404	tg->cfs_rq[cpu] = cfs_rq;
				9405	tg->se[cpu] = se;
				9406
				9407	/* se could be NULL for root_task_group */
				9408	if (!se)
				9409	return;
				9410
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	9411	if (!parent) {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9412	se->cfs_rq = &rq->cfs;
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	9413	se->depth = 0;
				9414	} else {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9415	se->cfs_rq = parent->my_q;
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	9416	se->depth = parent->depth + 1;
				9417	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9418
				9419	se->my_q = cfs_rq;
Paul Turner	0ac9b1c	2013-10-16 11:16:27 -0700	[diff] [blame]	9420	/* guarantee group entities always have weight */
				9421	update_load_set(&se->load, NICE_0_LOAD);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9422	se->parent = parent;
				9423	}
				9424
				9425	static DEFINE_MUTEX(shares_mutex);
				9426
				9427	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				9428	{
				9429	int i;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9430
				9431	/*
				9432	* We can't change the weight of the root cgroup.
				9433	*/
				9434	if (!tg->se[0])
				9435	return -EINVAL;
				9436
				9437	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				9438
				9439	mutex_lock(&shares_mutex);
				9440	if (tg->shares == shares)
				9441	goto done;
				9442
				9443	tg->shares = shares;
				9444	for_each_possible_cpu(i) {
				9445	struct rq *rq = cpu_rq(i);
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	9446	struct sched_entity *se = tg->se[i];
				9447	struct rq_flags rf;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9448
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9449	/* Propagate contribution to hierarchy */
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	9450	rq_lock_irqsave(rq, &rf);
Frederic Weisbecker	71b1da4	2013-04-12 01:50:59 +0200	[diff] [blame]	9451	update_rq_clock(rq);
Vincent Guittot	89ee048	2016-12-21 16:50:26 +0100	[diff] [blame]	9452	for_each_sched_entity(se) {
				9453	update_load_avg(se, UPDATE_TG);
				9454	update_cfs_shares(se);
				9455	}
Peter Zijlstra	8a8c69c	2016-10-04 16:04:35 +0200	[diff] [blame]	9456	rq_unlock_irqrestore(rq, &rf);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9457	}
				9458
				9459	done:
				9460	mutex_unlock(&shares_mutex);
				9461	return 0;
				9462	}
				9463	#else /* CONFIG_FAIR_GROUP_SCHED */
				9464
				9465	void free_fair_sched_group(struct task_group *tg) { }
				9466
				9467	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				9468	{
				9469	return 1;
				9470	}
				9471
Peter Zijlstra	8663e24	2016-06-22 14:58:02 +0200	[diff] [blame]	9472	void online_fair_sched_group(struct task_group *tg) { }
				9473
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9474	void unregister_fair_sched_group(struct task_group *tg) { }
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9475
				9476	#endif /* CONFIG_FAIR_GROUP_SCHED */
				9477
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9478
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	9479	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9480	{
				9481	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9482	unsigned int rr_interval = 0;
				9483
				9484	/*
				9485	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				9486	* idle runqueue:
				9487	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9488	if (rq->cfs.load.weight)
Zhu Yanhai	a59f4e0	2013-01-08 12:56:52 +0800	[diff] [blame]	9489	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9490
				9491	return rr_interval;
				9492	}
				9493
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9494	/*
				9495	* All the scheduling class methods:
				9496	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9497	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	9498	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9499	.enqueue_task = enqueue_task_fair,
				9500	.dequeue_task = dequeue_task_fair,
				9501	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	9502	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9503
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	9504	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9505
				9506	.pick_next_task = pick_next_task_fair,
				9507	.put_prev_task = put_prev_task_fair,
				9508
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	9509	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	9510	.select_task_rq = select_task_rq_fair,
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	9511	.migrate_task_rq = migrate_task_rq_fair,
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	9512
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	9513	.rq_online = rq_online_fair,
				9514	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	9515
Yuyang Du	1269557	2015-07-15 08:04:40 +0800	[diff] [blame]	9516	.task_dead = task_dead_fair,
Peter Zijlstra	c5b2803	2015-05-15 17:43:35 +0200	[diff] [blame]	9517	.set_cpus_allowed = set_cpus_allowed_common,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	9518	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9519
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	9520	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9521	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	9522	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9523
				9524	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9525	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9526	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9527
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9528	.get_rr_interval = get_rr_interval_fair,
				9529
Stanislaw Gruszka	6e99891	2014-11-12 16:58:44 +0100	[diff] [blame]	9530	.update_curr = update_curr_fair,
				9531
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9532	#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittot	ea86cb4	2016-06-17 13:38:55 +0200	[diff] [blame]	9533	.task_change_group = task_change_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9534	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9535	};
				9536
				9537	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9538	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9539	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9540	struct cfs_rq *cfs_rq;
				9541
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	9542	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	9543	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	9544	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	9545	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9546	}
Srikar Dronamraju	397f237	2015-06-25 22:51:43 +0530	[diff] [blame]	9547
				9548	#ifdef CONFIG_NUMA_BALANCING
				9549	void show_numa_stats(struct task_struct p, struct seq_file m)
				9550	{
				9551	int node;
				9552	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
				9553
				9554	for_each_online_node(node) {
				9555	if (p->numa_faults) {
				9556	tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
				9557	tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
				9558	}
				9559	if (p->numa_group) {
				9560	gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
				9561	gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
				9562	}
				9563	print_numa_stats(m, node, tsf, tpf, gsf, gpf);
				9564	}
				9565	}
				9566	#endif /* CONFIG_NUMA_BALANCING */
				9567	#endif /* CONFIG_SCHED_DEBUG */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9568
				9569	__init void init_sched_fair_class(void)
				9570	{
				9571	#ifdef CONFIG_SMP
				9572	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				9573
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	9574	#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	9575	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9576	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9577	#endif
				9578	#endif /* SMP */
				9579
				9580	}