Blame - kernel/sched/fair.c - SHIFTPHONES/mainline/linux

blob: 18d9e75f1f6ef79654bfd9133be77a9ff92667f2 [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
Peter Zijlstra	90eec10	2015-11-16 11:08:45 +0100	[diff] [blame]	20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	23	#include <linux/sched.h>
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	24	#include <linux/latencytop.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	25	#include <linux/cpumask.h>
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	26	#include <linux/cpuidle.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	27	#include <linux/slab.h>
				28	#include <linux/profile.h>
				29	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	30	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	31	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	32	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	33
				34	#include <trace/events/sched.h>
				35
				36	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	37
/*
 * Targeted preemption latency for CPU-bound tasks:
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 *
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * sysctl_sched_latency is the effective value; the normalized_ variant is
 * the unscaled base that update_sysctl() multiplies by the CPU factor.
 */
unsigned int sysctl_sched_latency			= 6000000ULL;
unsigned int normalized_sysctl_sched_latency		= 6000000ULL;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	53
				54	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	55	* The initial- and re-scaling of tunables is configurable
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	56	*
				57	* Options are:
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	58	*
				59	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				60	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				61	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				62	*
				63	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	64	*/
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	65	enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	66
				67	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	68	* Minimal preemption granularity for CPU-bound tasks:
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	69	*
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	70	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	71	*/
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	72	unsigned int sysctl_sched_min_granularity = 750000ULL;
				73	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	74
				75	/*
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	76	* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	77	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	78	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	79
				80	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	81	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	82	* parent will (try to) run first.
				83	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	84	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	85
				86	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	87	* SCHED_OTHER wake-up granularity.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	88	*
				89	* This option delays the preemption effects of decoupled workloads
				90	* and reduces their over-scheduling. Synchronous workloads will still
				91	* have immediate wakeup/sleep latencies.
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	92	*
				93	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	94	*/
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	95	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
				96	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	97
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	98	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	99
#ifdef CONFIG_SMP
/*
 * For asym packing, by default the lower numbered cpu has higher priority.
 *
 * Weak default so that an architecture can supply its own ordering; it
 * returns the negated CPU number, so CPU 0 gets the largest value.
 */
int __weak arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}
#endif
				109
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * (default: 5 msec, units: microseconds)
 */
unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
#endif
				123
/*
 * The margin used when comparing utilization with CPU capacity:
 * util * margin < capacity * 1024
 *
 * With margin == 1280 this holds while util is below 1024/1280 == 80% of
 * capacity, i.e. roughly 20% headroom.
 *
 * (default: ~20%)
 */
unsigned int capacity_margin				= 1280;
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	131
Paul Gortmaker	8527632	2013-04-19 15:10:50 -0400	[diff] [blame]	132	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
				133	{
				134	lw->weight += inc;
				135	lw->inv_weight = 0;
				136	}
				137
				138	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
				139	{
				140	lw->weight -= dec;
				141	lw->inv_weight = 0;
				142	}
				143
				144	static inline void update_load_set(struct load_weight *lw, unsigned long w)
				145	{
				146	lw->weight = w;
				147	lw->inv_weight = 0;
				148	}
				149
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	150	/*
				151	* Increase the granularity value when there are more CPUs,
				152	* because with more CPUs the 'effective latency' as visible
				153	* to users decreases. But the relationship is not linear,
				154	* so pick a second-best guess by going with the log2 of the
				155	* number of CPUs.
				156	*
				157	* This idea comes from the SD scheduler of Con Kolivas:
				158	*/
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	159	static unsigned int get_update_sysctl_factor(void)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	160	{
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	161	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	162	unsigned int factor;
				163
				164	switch (sysctl_sched_tunable_scaling) {
				165	case SCHED_TUNABLESCALING_NONE:
				166	factor = 1;
				167	break;
				168	case SCHED_TUNABLESCALING_LINEAR:
				169	factor = cpus;
				170	break;
				171	case SCHED_TUNABLESCALING_LOG:
				172	default:
				173	factor = 1 + ilog2(cpus);
				174	break;
				175	}
				176
				177	return factor;
				178	}
				179
				180	static void update_sysctl(void)
				181	{
				182	unsigned int factor = get_update_sysctl_factor();
				183
				184	#define SET_SYSCTL(name) \
				185	(sysctl_##name = (factor) * normalized_sysctl_##name)
				186	SET_SYSCTL(sched_min_granularity);
				187	SET_SYSCTL(sched_latency);
				188	SET_SYSCTL(sched_wakeup_granularity);
				189	#undef SET_SYSCTL
				190	}
				191
				192	void sched_init_granularity(void)
				193	{
				194	update_sysctl();
				195	}
				196
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	197	#define WMULT_CONST (~0U)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	198	#define WMULT_SHIFT 32
				199
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	200	static void __update_inv_weight(struct load_weight *lw)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	201	{
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	202	unsigned long w;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	203
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	204	if (likely(lw->inv_weight))
				205	return;
				206
				207	w = scale_load_down(lw->weight);
				208
				209	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				210	lw->inv_weight = 1;
				211	else if (unlikely(!w))
				212	lw->inv_weight = WMULT_CONST;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	213	else
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	214	lw->inv_weight = WMULT_CONST / w;
				215	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	216
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	217	/*
				218	* delta_exec * weight / lw.weight
				219	* OR
				220	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
				221	*
Yuyang Du	1c3de5e	2016-03-30 07:07:51 +0800	[diff] [blame]	222	* Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	223	* we're guaranteed shift stays positive because inv_weight is guaranteed to
				224	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
				225	*
				226	* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
				227	* weight/lw.weight <= 1, and therefore our shift will also be positive.
				228	*/
				229	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
				230	{
				231	u64 fact = scale_load_down(weight);
				232	int shift = WMULT_SHIFT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	233
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	234	__update_inv_weight(lw);
				235
				236	if (unlikely(fact >> 32)) {
				237	while (fact >> 32) {
				238	fact >>= 1;
				239	shift--;
				240	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	241	}
				242
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	243	/* hint to use a 32x32->64 mul */
				244	fact = (u64)(u32)fact * lw->inv_weight;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	245
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	246	while (fact >> 32) {
				247	fact >>= 1;
				248	shift--;
				249	}
				250
				251	return mul_u64_u32_shr(delta_exec, fact, shift);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	252	}
				253
				254
				255	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	256
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	257	/**************************************************************
				258	* CFS operations on generic schedulable entities:
				259	*/
				260
				261	#ifdef CONFIG_FAIR_GROUP_SCHED
				262
				263	/* cpu runqueue to which this cfs_rq is attached */
				264	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				265	{
				266	return cfs_rq->rq;
				267	}
				268
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	269	/* An entity is a task if it doesn't "own" a runqueue */
				270	#define entity_is_task(se) (!se->my_q)
				271
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	272	static inline struct task_struct task_of(struct sched_entity se)
				273	{
Peter Zijlstra	9148a3a	2016-09-20 22:34:51 +0200	[diff] [blame]	274	SCHED_WARN_ON(!entity_is_task(se));
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	275	return container_of(se, struct task_struct, se);
				276	}
				277
/*
 * Walk up scheduling entities hierarchy.
 * Note: advances @se itself; it is NULL once the loop runs to completion.
 */
#define for_each_sched_entity(se) \
		for (; (se); (se) = (se)->parent)
				281
				282	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				283	{
				284	return p->se.cfs_rq;
				285	}
				286
				287	/* runqueue on which this entity is (to be) queued */
				288	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				289	{
				290	return se->cfs_rq;
				291	}
				292
				293	/* runqueue "owned" by this group */
				294	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				295	{
				296	return grp->my_q;
				297	}
				298
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	299	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				300	{
				301	if (!cfs_rq->on_list) {
Vincent Guittot	9c2791f	2016-11-08 10:53:43 +0100	[diff] [blame]	302	struct rq *rq = rq_of(cfs_rq);
				303	int cpu = cpu_of(rq);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	304	/*
				305	* Ensure we either appear before our parent (if already
				306	* enqueued) or force our parent to appear after us when it is
Vincent Guittot	9c2791f	2016-11-08 10:53:43 +0100	[diff] [blame]	307	* enqueued. The fact that we always enqueue bottom-up
				308	* reduces this to two cases and a special case for the root
				309	* cfs_rq. Furthermore, it also means that we will always reset
				310	* tmp_alone_branch either when the branch is connected
				311	* to a tree or when we reach the beg of the tree
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	312	*/
				313	if (cfs_rq->tg->parent &&
Vincent Guittot	9c2791f	2016-11-08 10:53:43 +0100	[diff] [blame]	314	cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
				315	/*
				316	* If parent is already on the list, we add the child
				317	* just before. Thanks to circular linked property of
				318	* the list, this means to put the child at the tail
				319	* of the list that starts by parent.
				320	*/
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	321	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
Vincent Guittot	9c2791f	2016-11-08 10:53:43 +0100	[diff] [blame]	322	&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
				323	/*
				324	* The branch is now connected to its tree so we can
				325	* reset tmp_alone_branch to the beginning of the
				326	* list.
				327	*/
				328	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
				329	} else if (!cfs_rq->tg->parent) {
				330	/*
				331	* cfs rq without parent should be put
				332	* at the tail of the list.
				333	*/
				334	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				335	&rq->leaf_cfs_rq_list);
				336	/*
				337	* We have reach the beg of a tree so we can reset
				338	* tmp_alone_branch to the beginning of the list.
				339	*/
				340	rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
				341	} else {
				342	/*
				343	* The parent has not already been added so we want to
				344	* make sure that it will be put after us.
				345	* tmp_alone_branch points to the beg of the branch
				346	* where we will add parent.
				347	*/
				348	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				349	rq->tmp_alone_branch);
				350	/*
				351	* update tmp_alone_branch to points to the new beg
				352	* of the branch
				353	*/
				354	rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	355	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	356
				357	cfs_rq->on_list = 1;
				358	}
				359	}
				360
				361	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				362	{
				363	if (cfs_rq->on_list) {
				364	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				365	cfs_rq->on_list = 0;
				366	}
				367	}
				368
/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &(rq)->leaf_cfs_rq_list, leaf_cfs_rq_list)
				372
				373	/* Do the two (enqueued) entities belong to the same group ? */
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	374	static inline struct cfs_rq *
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	375	is_same_group(struct sched_entity se, struct sched_entity pse)
				376	{
				377	if (se->cfs_rq == pse->cfs_rq)
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	378	return se->cfs_rq;
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	379
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	380	return NULL;
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	381	}
				382
				383	static inline struct sched_entity parent_entity(struct sched_entity se)
				384	{
				385	return se->parent;
				386	}
				387
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	388	static void
				389	find_matching_se(struct sched_entity se, struct sched_entity pse)
				390	{
				391	int se_depth, pse_depth;
				392
				393	/*
				394	* preemption test can be made between sibling entities who are in the
				395	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				396	* both tasks until we find their ancestors who are siblings of common
				397	* parent.
				398	*/
				399
				400	/* First walk up until both entities are at same depth */
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	401	se_depth = (*se)->depth;
				402	pse_depth = (*pse)->depth;
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	403
				404	while (se_depth > pse_depth) {
				405	se_depth--;
				406	se = parent_entity(se);
				407	}
				408
				409	while (pse_depth > se_depth) {
				410	pse_depth--;
				411	pse = parent_entity(pse);
				412	}
				413
				414	while (!is_same_group(se, pse)) {
				415	se = parent_entity(se);
				416	pse = parent_entity(pse);
				417	}
				418	}
				419
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	420	#else /* !CONFIG_FAIR_GROUP_SCHED */
				421
				422	static inline struct task_struct task_of(struct sched_entity se)
				423	{
				424	return container_of(se, struct task_struct, se);
				425	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	426
				427	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				428	{
				429	return container_of(cfs_rq, struct rq, cfs);
				430	}
				431
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	432	#define entity_is_task(se) 1
				433
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	434	#define for_each_sched_entity(se) \
				435	for (; se; se = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	436
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	437	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	438	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	439	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	440	}
				441
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	442	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				443	{
				444	struct task_struct *p = task_of(se);
				445	struct rq *rq = task_rq(p);
				446
				447	return &rq->cfs;
				448	}
				449
				450	/* runqueue "owned" by this group */
				451	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				452	{
				453	return NULL;
				454	}
				455
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	456	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				457	{
				458	}
				459
				460	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				461	{
				462	}
				463
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	464	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				465	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
				466
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	467	static inline struct sched_entity parent_entity(struct sched_entity se)
				468	{
				469	return NULL;
				470	}
				471
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	472	static inline void
				473	find_matching_se(struct sched_entity se, struct sched_entity pse)
				474	{
				475	}
				476
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	477	#endif /* CONFIG_FAIR_GROUP_SCHED */
				478
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	479	static __always_inline
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	480	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	481
				482	/**************************************************************
				483	* Scheduling class tree data structure manipulation methods:
				484	*/
				485
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	486	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	487	{
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	488	s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	489	if (delta > 0)
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	490	max_vruntime = vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	491
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	492	return max_vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	493	}
				494
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	495	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	496	{
				497	s64 delta = (s64)(vruntime - min_vruntime);
				498	if (delta < 0)
				499	min_vruntime = vruntime;
				500
				501	return min_vruntime;
				502	}
				503
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	504	static inline int entity_before(struct sched_entity *a,
				505	struct sched_entity *b)
				506	{
				507	return (s64)(a->vruntime - b->vruntime) < 0;
				508	}
				509
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	510	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				511	{
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	512	struct sched_entity *curr = cfs_rq->curr;
				513
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	514	u64 vruntime = cfs_rq->min_vruntime;
				515
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	516	if (curr) {
				517	if (curr->on_rq)
				518	vruntime = curr->vruntime;
				519	else
				520	curr = NULL;
				521	}
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	522
				523	if (cfs_rq->rb_leftmost) {
				524	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				525	struct sched_entity,
				526	run_node);
				527
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	528	if (!curr)
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	529	vruntime = se->vruntime;
				530	else
				531	vruntime = min_vruntime(vruntime, se->vruntime);
				532	}
				533
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	534	/* ensure we never gain time by being placed backwards. */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	535	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	536	#ifndef CONFIG_64BIT
				537	smp_wmb();
				538	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				539	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	540	}
				541
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	542	/*
				543	* Enqueue an entity into the rb-tree:
				544	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	545	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	546	{
				547	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				548	struct rb_node *parent = NULL;
				549	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	550	int leftmost = 1;
				551
				552	/*
				553	* Find the right place in the rbtree:
				554	*/
				555	while (*link) {
				556	parent = *link;
				557	entry = rb_entry(parent, struct sched_entity, run_node);
				558	/*
				559	* We dont care about collisions. Nodes with
				560	* the same key stay together.
				561	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	562	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	563	link = &parent->rb_left;
				564	} else {
				565	link = &parent->rb_right;
				566	leftmost = 0;
				567	}
				568	}
				569
				570	/*
				571	* Maintain a cache of leftmost tree entries (it is frequently
				572	* used):
				573	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	574	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	575	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	576
				577	rb_link_node(&se->run_node, parent, link);
				578	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	579	}
				580
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	581	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	582	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	583	if (cfs_rq->rb_leftmost == &se->run_node) {
				584	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	585
				586	next_node = rb_next(&se->run_node);
				587	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	588	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	589
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	590	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	591	}
				592
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	593	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	594	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	595	struct rb_node *left = cfs_rq->rb_leftmost;
				596
				597	if (!left)
				598	return NULL;
				599
				600	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	601	}
				602
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	603	static struct sched_entity __pick_next_entity(struct sched_entity se)
				604	{
				605	struct rb_node *next = rb_next(&se->run_node);
				606
				607	if (!next)
				608	return NULL;
				609
				610	return rb_entry(next, struct sched_entity, run_node);
				611	}
				612
				613	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	614	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	615	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	616	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	617
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	618	if (!last)
				619	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	620
				621	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	622	}
				623
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	624	/**************************************************************
				625	* Scheduling class statistics methods:
				626	*/
				627
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	628	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	629	void __user buffer, size_t lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	630	loff_t *ppos)
				631	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	632	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	633	unsigned int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	634
				635	if (ret \|\| !write)
				636	return ret;
				637
				638	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				639	sysctl_sched_min_granularity);
				640
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	641	#define WRT_SYSCTL(name) \
				642	(normalized_sysctl_##name = sysctl_##name / (factor))
				643	WRT_SYSCTL(sched_min_granularity);
				644	WRT_SYSCTL(sched_latency);
				645	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	646	#undef WRT_SYSCTL
				647
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	648	return 0;
				649	}
				650	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	651
				652	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	653	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	654	*/
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	655	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	656	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	657	if (unlikely(se->load.weight != NICE_0_LOAD))
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	658	delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	659
				660	return delta;
				661	}
				662
				663	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	664	* The idea is to set a period in which each task runs once.
				665	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	666	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	667	* this period because otherwise the slices get too small.
				668	*
				669	* p = (nr <= nl) ? l : l*nr/nl
				670	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	671	static u64 __sched_period(unsigned long nr_running)
				672	{
Boqun Feng	8e2b0bf	2015-07-02 22:25:52 +0800	[diff] [blame]	673	if (unlikely(nr_running > sched_nr_latency))
				674	return nr_running * sysctl_sched_min_granularity;
				675	else
				676	return sysctl_sched_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	677	}
				678
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	679	/*
				680	* We calculate the wall-time slice from the period by taking a part
				681	* proportional to the weight.
				682	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	683	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	684	*/
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	685	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	686	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	687	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	688
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	689	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	690	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	691	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	692
				693	cfs_rq = cfs_rq_of(se);
				694	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	695
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	696	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	697	lw = cfs_rq->load;
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	698
				699	update_load_add(&lw, se->load.weight);
				700	load = &lw;
				701	}
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	702	slice = __calc_delta(slice, se->load.weight, load);
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	703	}
				704	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	705	}
				706
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	707	/*
Andrei Epure	660cc00	2013-03-11 12:03:20 +0200	[diff] [blame]	708	* We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	709	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	710	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	711	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	712	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	713	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	714	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	715	}
				716
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	717	#ifdef CONFIG_SMP
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	718	static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	719	static unsigned long task_h_load(struct task_struct *p);
				720
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	721	/*
				722	* We choose a half-life close to 1 scheduling period.
Leo Yan	84fb5a1	2015-09-15 18:57:37 +0800	[diff] [blame]	723	* Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
				724	* dependent on this value.
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	725	*/
				726	#define LOAD_AVG_PERIOD 32
				727	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
Leo Yan	84fb5a1	2015-09-15 18:57:37 +0800	[diff] [blame]	728	#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	729
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	730	/* Give new sched_entity start runnable values to heavy its load in infant time */
				731	void init_entity_runnable_average(struct sched_entity *se)
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	732	{
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	733	struct sched_avg *sa = &se->avg;
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	734
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	735	sa->last_update_time = 0;
				736	/*
				737	* sched_avg's period_contrib should be strictly less then 1024, so
				738	* we give it 1023 to make sure it is almost a period (1024us), and
				739	* will definitely be update (after enqueue).
				740	*/
				741	sa->period_contrib = 1023;
Vincent Guittot	b5a9b34	2016-10-19 14:45:23 +0200	[diff] [blame]	742	/*
				743	* Tasks are intialized with full load to be seen as heavy tasks until
				744	* they get a chance to stabilize to their real load level.
				745	* Group entities are intialized with zero load to reflect the fact that
				746	* nothing has been attached to the task group yet.
				747	*/
				748	if (entity_is_task(se))
				749	sa->load_avg = scale_load_down(se->load.weight);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	750	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
Yuyang Du	2b8c41d	2016-03-30 04:30:56 +0800	[diff] [blame]	751	/*
				752	* At this point, util_avg won't be used in select_task_rq_fair anyway
				753	*/
				754	sa->util_avg = 0;
				755	sa->util_sum = 0;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	756	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	757	}
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	758
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	759	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	760	static void attach_entity_cfs_rq(struct sched_entity *se);
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	761
Yuyang Du	2b8c41d	2016-03-30 04:30:56 +0800	[diff] [blame]	762	/*
				763	* With new tasks being created, their initial util_avgs are extrapolated
				764	* based on the cfs_rq's current util_avg:
				765	*
				766	* util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
				767	*
				768	* However, in many cases, the above util_avg does not give a desired
				769	* value. Moreover, the sum of the util_avgs may be divergent, such
				770	* as when the series is a harmonic series.
				771	*
				772	* To solve this problem, we also cap the util_avg of successive tasks to
				773	* only 1/2 of the left utilization budget:
				774	*
				775	* util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
				776	*
				777	* where n denotes the nth task.
				778	*
				779	* For example, a simplest series from the beginning would be like:
				780	*
				781	* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
				782	* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
				783	*
				784	* Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
				785	* if util_avg > util_avg_cap.
				786	*/
				787	void post_init_entity_util_avg(struct sched_entity *se)
				788	{
				789	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				790	struct sched_avg *sa = &se->avg;
Yuyang Du	172895e	2016-04-05 12:12:27 +0800	[diff] [blame]	791	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
Yuyang Du	2b8c41d	2016-03-30 04:30:56 +0800	[diff] [blame]	792
				793	if (cap > 0) {
				794	if (cfs_rq->avg.util_avg != 0) {
				795	sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
				796	sa->util_avg /= (cfs_rq->avg.load_avg + 1);
				797
				798	if (sa->util_avg > cap)
				799	sa->util_avg = cap;
				800	} else {
				801	sa->util_avg = cap;
				802	}
				803	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
				804	}
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	805
				806	if (entity_is_task(se)) {
				807	struct task_struct *p = task_of(se);
				808	if (p->sched_class != &fair_sched_class) {
				809	/*
				810	* For !fair tasks do:
				811	*
				812	update_cfs_rq_load_avg(now, cfs_rq, false);
				813	attach_entity_load_avg(cfs_rq, se);
				814	switched_from_fair(rq, p);
				815	*
				816	* such that the next switched_to_fair() has the
				817	* expected state.
				818	*/
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	819	se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	820	return;
				821	}
				822	}
				823
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	824	attach_entity_cfs_rq(se);
Yuyang Du	2b8c41d	2016-03-30 04:30:56 +0800	[diff] [blame]	825	}
				826
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	827	#else /* !CONFIG_SMP */
/* !CONFIG_SMP: there is no per-entity average to initialize. */
void init_entity_runnable_average(struct sched_entity *se) { }
/* !CONFIG_SMP: empty counterpart of the SMP implementation. */
void post_init_entity_util_avg(struct sched_entity *se) { }
/* !CONFIG_SMP: stub that does nothing. */
static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { }
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	837	#endif /* CONFIG_SMP */
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	838
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	839	/*
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	840	* Update the current task's runtime statistics.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	841	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	842	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	843	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	844	struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	845	u64 now = rq_clock_task(rq_of(cfs_rq));
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	846	u64 delta_exec;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	847
				848	if (unlikely(!curr))
				849	return;
				850
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	851	delta_exec = now - curr->exec_start;
				852	if (unlikely((s64)delta_exec <= 0))
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	853	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	854
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	855	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	856
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	857	schedstat_set(curr->statistics.exec_max,
				858	max(delta_exec, curr->statistics.exec_max));
				859
				860	curr->sum_exec_runtime += delta_exec;
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	861	schedstat_add(cfs_rq->exec_clock, delta_exec);
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	862
				863	curr->vruntime += calc_delta_fair(delta_exec, curr);
				864	update_min_vruntime(cfs_rq);
				865
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	866	if (entity_is_task(curr)) {
				867	struct task_struct *curtask = task_of(curr);
				868
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	869	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	870	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	871	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	872	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	873
				874	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	875	}
				876
Stanislaw Gruszka	6e99891	2014-11-12 16:58:44 +0100	[diff] [blame]	877	static void update_curr_fair(struct rq *rq)
				878	{
				879	update_curr(cfs_rq_of(&rq->curr->se));
				880	}
				881
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	882	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	883	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	884	{
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	885	u64 wait_start, prev_wait_start;
				886
				887	if (!schedstat_enabled())
				888	return;
				889
				890	wait_start = rq_clock(rq_of(cfs_rq));
				891	prev_wait_start = schedstat_val(se->statistics.wait_start);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	892
				893	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	894	likely(wait_start > prev_wait_start))
				895	wait_start -= prev_wait_start;
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	896
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	897	schedstat_set(se->statistics.wait_start, wait_start);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	898	}
				899
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	900	static inline void
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	901	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
				902	{
				903	struct task_struct *p;
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	904	u64 delta;
				905
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	906	if (!schedstat_enabled())
				907	return;
				908
				909	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	910
				911	if (entity_is_task(se)) {
				912	p = task_of(se);
				913	if (task_on_rq_migrating(p)) {
				914	/*
				915	* Preserve migrating task's wait time so wait_start
				916	* time stamp can be adjusted to accumulate wait time
				917	* prior to migration.
				918	*/
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	919	schedstat_set(se->statistics.wait_start, delta);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	920	return;
				921	}
				922	trace_sched_stat_wait(p, delta);
				923	}
				924
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	925	schedstat_set(se->statistics.wait_max,
				926	max(schedstat_val(se->statistics.wait_max), delta));
				927	schedstat_inc(se->statistics.wait_count);
				928	schedstat_add(se->statistics.wait_sum, delta);
				929	schedstat_set(se->statistics.wait_start, 0);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	930	}
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	931
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	932	static inline void
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	933	update_stats_enqueue_sleeper(struct cfs_rq cfs_rq, struct sched_entity se)
				934	{
				935	struct task_struct *tsk = NULL;
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	936	u64 sleep_start, block_start;
				937
				938	if (!schedstat_enabled())
				939	return;
				940
				941	sleep_start = schedstat_val(se->statistics.sleep_start);
				942	block_start = schedstat_val(se->statistics.block_start);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	943
				944	if (entity_is_task(se))
				945	tsk = task_of(se);
				946
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	947	if (sleep_start) {
				948	u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	949
				950	if ((s64)delta < 0)
				951	delta = 0;
				952
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	953	if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
				954	schedstat_set(se->statistics.sleep_max, delta);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	955
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	956	schedstat_set(se->statistics.sleep_start, 0);
				957	schedstat_add(se->statistics.sum_sleep_runtime, delta);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	958
				959	if (tsk) {
				960	account_scheduler_latency(tsk, delta >> 10, 1);
				961	trace_sched_stat_sleep(tsk, delta);
				962	}
				963	}
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	964	if (block_start) {
				965	u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	966
				967	if ((s64)delta < 0)
				968	delta = 0;
				969
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	970	if (unlikely(delta > schedstat_val(se->statistics.block_max)))
				971	schedstat_set(se->statistics.block_max, delta);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	972
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	973	schedstat_set(se->statistics.block_start, 0);
				974	schedstat_add(se->statistics.sum_sleep_runtime, delta);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	975
				976	if (tsk) {
				977	if (tsk->in_iowait) {
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	978	schedstat_add(se->statistics.iowait_sum, delta);
				979	schedstat_inc(se->statistics.iowait_count);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	980	trace_sched_stat_iowait(tsk, delta);
				981	}
				982
				983	trace_sched_stat_blocked(tsk, delta);
				984
				985	/*
				986	* Blocking time is in units of nanosecs, so shift by
				987	* 20 to get a milliseconds-range estimation of the
				988	* amount of time that the task spent sleeping:
				989	*/
				990	if (unlikely(prof_on == SLEEP_PROFILING)) {
				991	profile_hits(SLEEP_PROFILING,
				992	(void *)get_wchan(tsk),
				993	delta >> 20);
				994	}
				995	account_scheduler_latency(tsk, delta >> 10, 0);
				996	}
				997	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	998	}
				999
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1000	/*
				1001	* Task is being enqueued - update stats:
				1002	*/
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1003	static inline void
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	1004	update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1005	{
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	1006	if (!schedstat_enabled())
				1007	return;
				1008
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1009	/*
				1010	* Are we enqueueing a waiting task? (for current tasks
				1011	* a dequeue/enqueue event is a NOP)
				1012	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1013	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	1014	update_stats_wait_start(cfs_rq, se);
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	1015
				1016	if (flags & ENQUEUE_WAKEUP)
				1017	update_stats_enqueue_sleeper(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1018	}
				1019
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1020	static inline void
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1021	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1022	{
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	1023
				1024	if (!schedstat_enabled())
				1025	return;
				1026
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1027	/*
				1028	* Mark the end of the wait period if dequeueing a
				1029	* waiting task:
				1030	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1031	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	1032	update_stats_wait_end(cfs_rq, se);
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1033
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	1034	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
				1035	struct task_struct *tsk = task_of(se);
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1036
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	1037	if (tsk->state & TASK_INTERRUPTIBLE)
				1038	schedstat_set(se->statistics.sleep_start,
				1039	rq_clock(rq_of(cfs_rq)));
				1040	if (tsk->state & TASK_UNINTERRUPTIBLE)
				1041	schedstat_set(se->statistics.block_start,
				1042	rq_clock(rq_of(cfs_rq)));
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	1043	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1044	}
				1045
				1046	/*
				1047	* We are picking a new current task - update its stats:
				1048	*/
				1049	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	1050	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1051	{
				1052	/*
				1053	* We are starting a new run period:
				1054	*/
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1055	se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1056	}
				1057
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1058	/**************************************************
				1059	* Scheduling class queueing methods:
				1060	*/
				1061
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1062	#ifdef CONFIG_NUMA_BALANCING
				1063	/*
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1064	* Approximate time to scan a full NUMA task in ms. The task scan period is
				1065	* calculated based on the tasks virtual memory size and
				1066	* numa_balancing_scan_size.
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1067	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1068	unsigned int sysctl_numa_balancing_scan_period_min = 1000;
				1069	unsigned int sysctl_numa_balancing_scan_period_max = 60000;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1070
				1071	/* Portion of address space to scan in MB */
				1072	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1073
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	1074	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				1075	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				1076
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1077	static unsigned int task_nr_scan_windows(struct task_struct *p)
				1078	{
				1079	unsigned long rss = 0;
				1080	unsigned long nr_scan_pages;
				1081
				1082	/*
				1083	* Calculations based on RSS as non-present and empty pages are skipped
				1084	* by the PTE scanner and NUMA hinting faults should be trapped based
				1085	* on resident pages
				1086	*/
				1087	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
				1088	rss = get_mm_rss(p->mm);
				1089	if (!rss)
				1090	rss = nr_scan_pages;
				1091
				1092	rss = round_up(rss, nr_scan_pages);
				1093	return rss / nr_scan_pages;
				1094	}
				1095
				1096	/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
				1097	#define MAX_SCAN_WINDOW 2560
				1098
				1099	static unsigned int task_scan_min(struct task_struct *p)
				1100	{
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	1101	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1102	unsigned int scan, floor;
				1103	unsigned int windows = 1;
				1104
Kirill Tkhai	6419265	2014-10-16 14:39:37 +0400	[diff] [blame]	1105	if (scan_size < MAX_SCAN_WINDOW)
				1106	windows = MAX_SCAN_WINDOW / scan_size;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1107	floor = 1000 / windows;
				1108
				1109	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
				1110	return max_t(unsigned int, floor, scan);
				1111	}
				1112
				1113	static unsigned int task_scan_max(struct task_struct *p)
				1114	{
				1115	unsigned int smin = task_scan_min(p);
				1116	unsigned int smax;
				1117
				1118	/* Watch for min being lower than max due to floor calculations */
				1119	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
				1120	return max(smin, smax);
				1121	}
				1122
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	1123	static void account_numa_enqueue(struct rq rq, struct task_struct p)
				1124	{
				1125	rq->nr_numa_running += (p->numa_preferred_nid != -1);
				1126	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
				1127	}
				1128
				1129	static void account_numa_dequeue(struct rq rq, struct task_struct p)
				1130	{
				1131	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
				1132	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
				1133	}
				1134
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1135	struct numa_group {
				1136	atomic_t refcount;
				1137
				1138	spinlock_t lock; /* nr_tasks, tasks */
				1139	int nr_tasks;
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	1140	pid_t gid;
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1141	int active_nodes;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1142
				1143	struct rcu_head rcu;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	1144	unsigned long total_faults;
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1145	unsigned long max_faults_cpu;
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1146	/*
				1147	* Faults_cpu is used to decide whether memory should move
				1148	* towards the CPU. As a consequence, these stats are weighted
				1149	* more by CPU use than by memory faults.
				1150	*/
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	1151	unsigned long *faults_cpu;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	1152	unsigned long faults[0];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1153	};
				1154
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	1155	/* Shared or private faults. */
				1156	#define NR_NUMA_HINT_FAULT_TYPES 2
				1157
				1158	/* Memory and CPU locality */
				1159	#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
				1160
				1161	/* Averaged statistics, and temporary buffers. */
				1162	#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
				1163
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	1164	pid_t task_numa_group_id(struct task_struct *p)
				1165	{
				1166	return p->numa_group ? p->numa_group->gid : 0;
				1167	}
				1168
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1169	/*
				1170	* The averaged statistics, shared & private, memory & cpu,
				1171	* occupy the first half of the array. The second half of the
				1172	* array is for current counters, which are averaged into the
				1173	* first set by task_numa_placement.
				1174	*/
				1175	static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1176	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1177	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1178	}
				1179
				1180	static inline unsigned long task_faults(struct task_struct *p, int nid)
				1181	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1182	if (!p->numa_faults)
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1183	return 0;
				1184
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1185	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				1186	p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1187	}
				1188
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1189	static inline unsigned long group_faults(struct task_struct *p, int nid)
				1190	{
				1191	if (!p->numa_group)
				1192	return 0;
				1193
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1194	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				1195	p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1196	}
				1197
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1198	static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
				1199	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1200	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
				1201	group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1202	}
				1203
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1204	/*
				1205	* A node triggering more than 1/3 as many NUMA faults as the maximum is
				1206	* considered part of a numa group's pseudo-interleaving set. Migrations
				1207	* between these nodes are slowed down, to allow things to settle down.
				1208	*/
				1209	#define ACTIVE_NODE_FRACTION 3
				1210
				1211	static bool numa_is_active_node(int nid, struct numa_group *ng)
				1212	{
				1213	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
				1214	}
				1215
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1216	/* Handle placement on systems where not all nodes are directly connected. */
				1217	static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
				1218	int maxdist, bool task)
				1219	{
				1220	unsigned long score = 0;
				1221	int node;
				1222
				1223	/*
				1224	* All nodes are directly connected, and the same distance
				1225	* from each other. No need for fancy placement algorithms.
				1226	*/
				1227	if (sched_numa_topology_type == NUMA_DIRECT)
				1228	return 0;
				1229
				1230	/*
				1231	* This code is called for each node, introducing N^2 complexity,
				1232	* which should be ok given the number of nodes rarely exceeds 8.
				1233	*/
				1234	for_each_online_node(node) {
				1235	unsigned long faults;
				1236	int dist = node_distance(nid, node);
				1237
				1238	/*
				1239	* The furthest away nodes in the system are not interesting
				1240	* for placement; nid was already counted.
				1241	*/
				1242	if (dist == sched_max_numa_distance \|\| node == nid)
				1243	continue;
				1244
				1245	/*
				1246	* On systems with a backplane NUMA topology, compare groups
				1247	* of nodes, and move tasks towards the group with the most
				1248	* memory accesses. When comparing two nodes at distance
				1249	* "hoplimit", only nodes closer by than "hoplimit" are part
				1250	* of each group. Skip other nodes.
				1251	*/
				1252	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1253	dist > maxdist)
				1254	continue;
				1255
				1256	/* Add up the faults from nearby nodes. */
				1257	if (task)
				1258	faults = task_faults(p, node);
				1259	else
				1260	faults = group_faults(p, node);
				1261
				1262	/*
				1263	* On systems with a glueless mesh NUMA topology, there are
				1264	* no fixed "groups of nodes". Instead, nodes that are not
				1265	* directly connected bounce traffic through intermediate
				1266	* nodes; a numa_group can occupy any set of nodes.
				1267	* The further away a node is, the less the faults count.
				1268	* This seems to result in good task placement.
				1269	*/
				1270	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				1271	faults *= (sched_max_numa_distance - dist);
				1272	faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
				1273	}
				1274
				1275	score += faults;
				1276	}
				1277
				1278	return score;
				1279	}
				1280
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1281	/*
				1282	* These return the fraction of accesses done by a particular task, or
				1283	* task group, on a particular numa node. The group weight is given a
				1284	* larger multiplier, in order to group tasks together that are almost
				1285	* evenly spread out between numa nodes.
				1286	*/
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1287	static inline unsigned long task_weight(struct task_struct *p, int nid,
				1288	int dist)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1289	{
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1290	unsigned long faults, total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1291
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1292	if (!p->numa_faults)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1293	return 0;
				1294
				1295	total_faults = p->total_numa_faults;
				1296
				1297	if (!total_faults)
				1298	return 0;
				1299
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1300	faults = task_faults(p, nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1301	faults += score_nearby_nodes(p, nid, dist, true);
				1302
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1303	return 1000 * faults / total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1304	}
				1305
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1306	static inline unsigned long group_weight(struct task_struct *p, int nid,
				1307	int dist)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1308	{
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1309	unsigned long faults, total_faults;
				1310
				1311	if (!p->numa_group)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1312	return 0;
				1313
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1314	total_faults = p->numa_group->total_faults;
				1315
				1316	if (!total_faults)
				1317	return 0;
				1318
				1319	faults = group_faults(p, nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1320	faults += score_nearby_nodes(p, nid, dist, false);
				1321
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1322	return 1000 * faults / total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1323	}
				1324
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1325	bool should_numa_migrate_memory(struct task_struct p, struct page page,
				1326	int src_nid, int dst_cpu)
				1327	{
				1328	struct numa_group *ng = p->numa_group;
				1329	int dst_nid = cpu_to_node(dst_cpu);
				1330	int last_cpupid, this_cpupid;
				1331
				1332	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
				1333
				1334	/*
				1335	* Multi-stage node selection is used in conjunction with a periodic
				1336	* migration fault to build a temporal task<->page relation. By using
				1337	* a two-stage filter we remove short/unlikely relations.
				1338	*
				1339	* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
				1340	* a task's usage of a particular page (n_p) per total usage of this
				1341	* page (n_t) (in a given time-span) to a probability.
				1342	*
				1343	* Our periodic faults will sample this probability and getting the
				1344	* same result twice in a row, given these samples are fully
				1345	* independent, is then given by P(n)^2, provided our sample period
				1346	* is sufficiently short compared to the usage pattern.
				1347	*
				1348	* This quadric squishes small probabilities, making it less likely we
				1349	* act on an unlikely task<->page relation.
				1350	*/
				1351	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
				1352	if (!cpupid_pid_unset(last_cpupid) &&
				1353	cpupid_to_nid(last_cpupid) != dst_nid)
				1354	return false;
				1355
				1356	/* Always allow migrate on private faults */
				1357	if (cpupid_match_pid(p, last_cpupid))
				1358	return true;
				1359
				1360	/* A shared fault, but p->numa_group has not been set up yet. */
				1361	if (!ng)
				1362	return true;
				1363
				1364	/*
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1365	* Destination node is much more heavily used than the source
				1366	* node? Allow migration.
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1367	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1368	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
				1369	ACTIVE_NODE_FRACTION)
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1370	return true;
				1371
				1372	/*
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1373	* Distribute memory according to CPU & memory use on each node,
				1374	* with 3/4 hysteresis to avoid unnecessary memory migrations:
				1375	*
				1376	* faults_cpu(dst) 3 faults_cpu(src)
				1377	* --------------- * - > ---------------
				1378	* faults_mem(dst) 4 faults_mem(src)
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1379	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1380	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
				1381	group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1382	}
				1383
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1384	static unsigned long weighted_cpuload(const int cpu);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1385	static unsigned long source_load(int cpu, int type);
				1386	static unsigned long target_load(int cpu, int type);
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	1387	static unsigned long capacity_of(int cpu);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1388	static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1389
/*
 * Per-node statistics, accumulated over all CPUs of one node by
 * update_numa_stats().
 */
struct numa_stats {
	/* Sum of rq->nr_running over the node's CPUs */
	unsigned long nr_running;
	/* Sum of weighted_cpuload() over the node's CPUs */
	unsigned long load;

	/* Sum of capacity_of() over the node's CPUs */
	unsigned long compute_capacity;

	/* Roughly how many runnable tasks the node can take */
	unsigned long task_capacity;
	/* Non-zero while nr_running is below task_capacity */
	int has_free_capacity;
};
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1402
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1403	/*
				1404	* XXX borrowed from update_sg_lb_stats
				1405	*/
				1406	static void update_numa_stats(struct numa_stats *ns, int nid)
				1407	{
Rik van Riel	83d7f24	2014-08-04 13:23:28 -0400	[diff] [blame]	1408	int smt, cpu, cpus = 0;
				1409	unsigned long capacity;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1410
				1411	memset(ns, 0, sizeof(*ns));
				1412	for_each_cpu(cpu, cpumask_of_node(nid)) {
				1413	struct rq *rq = cpu_rq(cpu);
				1414
				1415	ns->nr_running += rq->nr_running;
				1416	ns->load += weighted_cpuload(cpu);
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	1417	ns->compute_capacity += capacity_of(cpu);
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1418
				1419	cpus++;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1420	}
				1421
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1422	/*
				1423	* If we raced with hotplug and there are no CPUs left in our mask
				1424	* the @ns structure is NULL'ed and task_numa_compare() will
				1425	* not find this node attractive.
				1426	*
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1427	* We'll either bail at !has_free_capacity, or we'll detect a huge
				1428	* imbalance and bail there.
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1429	*/
				1430	if (!cpus)
				1431	return;
				1432
Rik van Riel	83d7f24	2014-08-04 13:23:28 -0400	[diff] [blame]	1433	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
				1434	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
				1435	capacity = cpus / smt; /* cores */
				1436
				1437	ns->task_capacity = min_t(unsigned, capacity,
				1438	DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1439	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1440	}
				1441
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1442	struct task_numa_env {
				1443	struct task_struct *p;
				1444
				1445	int src_cpu, src_nid;
				1446	int dst_cpu, dst_nid;
				1447
				1448	struct numa_stats src_stats, dst_stats;
				1449
Wanpeng Li	40ea2b4	2013-12-05 19:10:17 +0800	[diff] [blame]	1450	int imbalance_pct;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1451	int dist;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1452
				1453	struct task_struct *best_task;
				1454	long best_imp;
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1455	int best_cpu;
				1456	};
				1457
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1458	static void task_numa_assign(struct task_numa_env *env,
				1459	struct task_struct *p, long imp)
				1460	{
				1461	if (env->best_task)
				1462	put_task_struct(env->best_task);
Oleg Nesterov	bac7857	2016-05-18 21:57:33 +0200	[diff] [blame]	1463	if (p)
				1464	get_task_struct(p);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1465
				1466	env->best_task = p;
				1467	env->best_imp = imp;
				1468	env->best_cpu = env->dst_cpu;
				1469	}
				1470
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1471	static bool load_too_imbalanced(long src_load, long dst_load,
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1472	struct task_numa_env *env)
				1473	{
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1474	long imb, old_imb;
				1475	long orig_src_load, orig_dst_load;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1476	long src_capacity, dst_capacity;
				1477
				1478	/*
				1479	* The load is corrected for the CPU capacity available on each node.
				1480	*
				1481	* src_load dst_load
				1482	* ------------ vs ---------
				1483	* src_capacity dst_capacity
				1484	*/
				1485	src_capacity = env->src_stats.compute_capacity;
				1486	dst_capacity = env->dst_stats.compute_capacity;
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1487
				1488	/* We care about the slope of the imbalance, not the direction. */
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1489	if (dst_load < src_load)
				1490	swap(dst_load, src_load);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1491
				1492	/* Is the difference below the threshold? */
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1493	imb = dst_load * src_capacity * 100 -
				1494	src_load * dst_capacity * env->imbalance_pct;
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1495	if (imb <= 0)
				1496	return false;
				1497
				1498	/*
				1499	* The imbalance is above the allowed threshold.
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1500	* Compare it with the old imbalance.
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1501	*/
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1502	orig_src_load = env->src_stats.load;
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1503	orig_dst_load = env->dst_stats.load;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1504
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1505	if (orig_dst_load < orig_src_load)
				1506	swap(orig_dst_load, orig_src_load);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1507
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1508	old_imb = orig_dst_load * src_capacity * 100 -
				1509	orig_src_load * dst_capacity * env->imbalance_pct;
				1510
				1511	/* Would this change make things worse? */
				1512	return (imb > old_imb);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1513	}
				1514
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1515	/*
				1516	* This checks if the overall compute and NUMA accesses of the system would
				1517	* be improved if the source tasks was migrated to the target dst_cpu taking
				1518	* into account that it might be best if task running on the dst_cpu should
				1519	* be exchanged with the source task
				1520	*/
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1521	static void task_numa_compare(struct task_numa_env *env,
				1522	long taskimp, long groupimp)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1523	{
				1524	struct rq *src_rq = cpu_rq(env->src_cpu);
				1525	struct rq *dst_rq = cpu_rq(env->dst_cpu);
				1526	struct task_struct *cur;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1527	long src_load, dst_load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1528	long load;
Rik van Riel	1c5d3eb	2014-06-23 11:46:15 -0400	[diff] [blame]	1529	long imp = env->p->numa_group ? groupimp : taskimp;
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1530	long moveimp = imp;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1531	int dist = env->dist;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1532
				1533	rcu_read_lock();
Oleg Nesterov	bac7857	2016-05-18 21:57:33 +0200	[diff] [blame]	1534	cur = task_rcu_dereference(&dst_rq->curr);
				1535	if (cur && ((cur->flags & PF_EXITING) \|\| is_idle_task(cur)))
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1536	cur = NULL;
				1537
				1538	/*
Peter Zijlstra	7af6833	2014-11-10 10:54:35 +0100	[diff] [blame]	1539	* Because we have preemption enabled we can get migrated around and
				1540	* end try selecting ourselves (current == env->p) as a swap candidate.
				1541	*/
				1542	if (cur == env->p)
				1543	goto unlock;
				1544
				1545	/*
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1546	* "imp" is the fault differential for the source task between the
				1547	* source and destination node. Calculate the total differential for
				1548	* the source task and potential destination task. The more negative
				1549	* the value is, the more rmeote accesses that would be expected to
				1550	* be incurred if the tasks were swapped.
				1551	*/
				1552	if (cur) {
				1553	/* Skip this swap candidate if cannot move to the source cpu */
				1554	if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
				1555	goto unlock;
				1556
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1557	/*
				1558	* If dst and source tasks are in the same NUMA group, or not
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1559	* in any group then look only at task weights.
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1560	*/
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1561	if (cur->numa_group == env->p->numa_group) {
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1562	imp = taskimp + task_weight(cur, env->src_nid, dist) -
				1563	task_weight(cur, env->dst_nid, dist);
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1564	/*
				1565	* Add some hysteresis to prevent swapping the
				1566	* tasks within a group over tiny differences.
				1567	*/
				1568	if (cur->numa_group)
				1569	imp -= imp/16;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1570	} else {
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1571	/*
				1572	* Compare the group weights. If a task is all by
				1573	* itself (not part of a group), use the task weight
				1574	* instead.
				1575	*/
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1576	if (cur->numa_group)
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1577	imp += group_weight(cur, env->src_nid, dist) -
				1578	group_weight(cur, env->dst_nid, dist);
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1579	else
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1580	imp += task_weight(cur, env->src_nid, dist) -
				1581	task_weight(cur, env->dst_nid, dist);
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1582	}
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1583	}
				1584
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1585	if (imp <= env->best_imp && moveimp <= env->best_imp)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1586	goto unlock;
				1587
				1588	if (!cur) {
				1589	/* Is there capacity at our destination? */
Rik van Riel	b932c03	2014-08-04 13:23:27 -0400	[diff] [blame]	1590	if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1591	!env->dst_stats.has_free_capacity)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1592	goto unlock;
				1593
				1594	goto balance;
				1595	}
				1596
				1597	/* Balance doesn't matter much if we're running a task per cpu */
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1598	if (imp > env->best_imp && src_rq->nr_running == 1 &&
				1599	dst_rq->nr_running == 1)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1600	goto assign;
				1601
				1602	/*
				1603	* In the overloaded case, try and keep the load balanced.
				1604	*/
				1605	balance:
Peter Zijlstra	e720fff	2014-07-11 16:01:53 +0200	[diff] [blame]	1606	load = task_h_load(env->p);
				1607	dst_load = env->dst_stats.load + load;
				1608	src_load = env->src_stats.load - load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1609
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1610	if (moveimp > imp && moveimp > env->best_imp) {
				1611	/*
				1612	* If the improvement from just moving env->p direction is
				1613	* better than swapping tasks around, check if a move is
				1614	* possible. Store a slightly smaller score than moveimp,
				1615	* so an actually idle CPU will win.
				1616	*/
				1617	if (!load_too_imbalanced(src_load, dst_load, env)) {
				1618	imp = moveimp - 1;
				1619	cur = NULL;
				1620	goto assign;
				1621	}
				1622	}
				1623
				1624	if (imp <= env->best_imp)
				1625	goto unlock;
				1626
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1627	if (cur) {
Peter Zijlstra	e720fff	2014-07-11 16:01:53 +0200	[diff] [blame]	1628	load = task_h_load(cur);
				1629	dst_load -= load;
				1630	src_load += load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1631	}
				1632
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1633	if (load_too_imbalanced(src_load, dst_load, env))
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1634	goto unlock;
				1635
Rik van Riel	ba7e5a2	2014-09-04 16:35:30 -0400	[diff] [blame]	1636	/*
				1637	* One idle CPU per node is evaluated for a task numa move.
				1638	* Call select_idle_sibling to maybe find a better one.
				1639	*/
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	1640	if (!cur) {
				1641	/*
				1642	* select_idle_siblings() uses an per-cpu cpumask that
				1643	* can be used from IRQ context.
				1644	*/
				1645	local_irq_disable();
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	1646	env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
				1647	env->dst_cpu);
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	1648	local_irq_enable();
				1649	}
Rik van Riel	ba7e5a2	2014-09-04 16:35:30 -0400	[diff] [blame]	1650
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1651	assign:
				1652	task_numa_assign(env, cur, imp);
				1653	unlock:
				1654	rcu_read_unlock();
				1655	}
				1656
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1657	static void task_numa_find_cpu(struct task_numa_env *env,
				1658	long taskimp, long groupimp)
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1659	{
				1660	int cpu;
				1661
				1662	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
				1663	/* Skip this CPU if the source task cannot migrate */
				1664	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
				1665	continue;
				1666
				1667	env->dst_cpu = cpu;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1668	task_numa_compare(env, taskimp, groupimp);
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1669	}
				1670	}
				1671
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1672	/* Only move tasks to a NUMA node less busy than the current node. */
				1673	static bool numa_has_capacity(struct task_numa_env *env)
				1674	{
				1675	struct numa_stats *src = &env->src_stats;
				1676	struct numa_stats *dst = &env->dst_stats;
				1677
				1678	if (src->has_free_capacity && !dst->has_free_capacity)
				1679	return false;
				1680
				1681	/*
				1682	* Only consider a task move if the source has a higher load
				1683	* than the destination, corrected for CPU capacity on each node.
				1684	*
				1685	* src->load dst->load
				1686	* --------------------- vs ---------------------
				1687	* src->compute_capacity dst->compute_capacity
				1688	*/
Srikar Dronamraju	44dcb04	2015-06-16 17:26:00 +0530	[diff] [blame]	1689	if (src->load * dst->compute_capacity * env->imbalance_pct >
				1690
				1691	dst->load * src->compute_capacity * 100)
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1692	return true;
				1693
				1694	return false;
				1695	}
				1696
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1697	static int task_numa_migrate(struct task_struct *p)
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1698	{
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1699	struct task_numa_env env = {
				1700	.p = p,
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1701
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1702	.src_cpu = task_cpu(p),
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	1703	.src_nid = task_node(p),
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1704
				1705	.imbalance_pct = 112,
				1706
				1707	.best_task = NULL,
				1708	.best_imp = 0,
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1709	.best_cpu = -1,
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1710	};
				1711	struct sched_domain *sd;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1712	unsigned long taskweight, groupweight;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1713	int nid, ret, dist;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1714	long taskimp, groupimp;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1715
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1716	/*
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1717	* Pick the lowest SD_NUMA domain, as that would have the smallest
				1718	* imbalance and would be the first to start moving tasks about.
				1719	*
				1720	* And we want to avoid any moving of tasks about, as that would create
				1721	* random movement of tasks -- counter the numa conditions we're trying
				1722	* to satisfy here.
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1723	*/
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1724	rcu_read_lock();
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1725	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1726	if (sd)
				1727	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1728	rcu_read_unlock();
				1729
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1730	/*
				1731	* Cpusets can break the scheduler domain tree into smaller
				1732	* balance domains, some of which do not cross NUMA boundaries.
				1733	* Tasks that are "trapped" in such domains cannot be migrated
				1734	* elsewhere, so there is no point in (re)trying.
				1735	*/
				1736	if (unlikely(!sd)) {
Wanpeng Li	de1b301	2013-12-12 15:23:24 +0800	[diff] [blame]	1737	p->numa_preferred_nid = task_node(p);
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1738	return -EINVAL;
				1739	}
				1740
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1741	env.dst_nid = p->numa_preferred_nid;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1742	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
				1743	taskweight = task_weight(p, env.src_nid, dist);
				1744	groupweight = group_weight(p, env.src_nid, dist);
				1745	update_numa_stats(&env.src_stats, env.src_nid);
				1746	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
				1747	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1748	update_numa_stats(&env.dst_stats, env.dst_nid);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1749
Rik van Riel	a43455a	2014-06-04 16:09:42 -0400	[diff] [blame]	1750	/* Try to find a spot on the preferred nid. */
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1751	if (numa_has_capacity(&env))
				1752	task_numa_find_cpu(&env, taskimp, groupimp);
Rik van Riel	e1dda8a	2013-10-07 11:29:19 +0100	[diff] [blame]	1753
Rik van Riel	9de05d4	2014-10-09 17:27:47 -0400	[diff] [blame]	1754	/*
				1755	* Look at other nodes in these cases:
				1756	* - there is no space available on the preferred_nid
				1757	* - the task is part of a numa_group that is interleaved across
				1758	* multiple NUMA nodes; in order to better consolidate the group,
				1759	* we need to check other locations.
				1760	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1761	if (env.best_cpu == -1 \|\| (p->numa_group && p->numa_group->active_nodes > 1)) {
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1762	for_each_online_node(nid) {
				1763	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1764	continue;
				1765
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1766	dist = node_distance(env.src_nid, env.dst_nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1767	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1768	dist != env.dist) {
				1769	taskweight = task_weight(p, env.src_nid, dist);
				1770	groupweight = group_weight(p, env.src_nid, dist);
				1771	}
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1772
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1773	/* Only consider nodes where both task and groups benefit */
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1774	taskimp = task_weight(p, nid, dist) - taskweight;
				1775	groupimp = group_weight(p, nid, dist) - groupweight;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1776	if (taskimp < 0 && groupimp < 0)
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1777	continue;
				1778
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1779	env.dist = dist;
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1780	env.dst_nid = nid;
				1781	update_numa_stats(&env.dst_stats, env.dst_nid);
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1782	if (numa_has_capacity(&env))
				1783	task_numa_find_cpu(&env, taskimp, groupimp);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1784	}
				1785	}
				1786
Rik van Riel	68d1b02	2014-04-11 13:00:29 -0400	[diff] [blame]	1787	/*
				1788	* If the task is part of a workload that spans multiple NUMA nodes,
				1789	* and is migrating into one of the workload's active nodes, remember
				1790	* this node as the task's preferred numa node, so the workload can
				1791	* settle down.
				1792	* A task that migrated to a second choice node will be better off
				1793	* trying for a better one later. Do not set the preferred node here.
				1794	*/
Rik van Riel	db015da	2014-06-23 11:41:34 -0400	[diff] [blame]	1795	if (p->numa_group) {
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1796	struct numa_group *ng = p->numa_group;
				1797
Rik van Riel	db015da	2014-06-23 11:41:34 -0400	[diff] [blame]	1798	if (env.best_cpu == -1)
				1799	nid = env.src_nid;
				1800	else
				1801	nid = env.dst_nid;
				1802
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1803	if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
Rik van Riel	db015da	2014-06-23 11:41:34 -0400	[diff] [blame]	1804	sched_setnuma(p, env.dst_nid);
				1805	}
				1806
				1807	/* No better CPU than the current one was found. */
				1808	if (env.best_cpu == -1)
				1809	return -EAGAIN;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	1810
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1811	/*
				1812	* Reset the scan period if the task is being rescheduled on an
				1813	* alternative node to recheck if the tasks is now properly placed.
				1814	*/
				1815	p->numa_scan_period = task_scan_min(p);
				1816
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1817	if (env.best_task == NULL) {
Mel Gorman	286549d	2014-01-21 15:51:03 -0800	[diff] [blame]	1818	ret = migrate_task_to(p, env.best_cpu);
				1819	if (ret != 0)
				1820	trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1821	return ret;
				1822	}
				1823
				1824	ret = migrate_swap(p, env.best_task);
Mel Gorman	286549d	2014-01-21 15:51:03 -0800	[diff] [blame]	1825	if (ret != 0)
				1826	trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1827	put_task_struct(env.best_task);
				1828	return ret;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1829	}
				1830
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1831	/* Attempt to migrate a task to a CPU on the preferred node. */
				1832	static void numa_migrate_preferred(struct task_struct *p)
				1833	{
Rik van Riel	5085e2a	2014-04-11 13:00:28 -0400	[diff] [blame]	1834	unsigned long interval = HZ;
				1835
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1836	/* This task has no NUMA fault statistics yet */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1837	if (unlikely(p->numa_preferred_nid == -1 \|\| !p->numa_faults))
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1838	return;
				1839
				1840	/* Periodically retry migrating the task to the preferred node */
Rik van Riel	5085e2a	2014-04-11 13:00:28 -0400	[diff] [blame]	1841	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
				1842	p->numa_migrate_retry = jiffies + interval;
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1843
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1844	/* Success if task is already running on preferred CPU */
Wanpeng Li	de1b301	2013-12-12 15:23:24 +0800	[diff] [blame]	1845	if (task_node(p) == p->numa_preferred_nid)
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1846	return;
				1847
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1848	/* Otherwise, try migrate to a CPU on the preferred node */
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1849	task_numa_migrate(p);
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1850	}
				1851
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1852	/*
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1853	* Find out how many nodes on the workload is actively running on. Do this by
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1854	* tracking the nodes from which NUMA hinting faults are triggered. This can
				1855	* be different from the set of nodes where the workload's memory is currently
				1856	* located.
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1857	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1858	static void numa_group_count_active_nodes(struct numa_group *numa_group)
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1859	{
				1860	unsigned long faults, max_faults = 0;
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1861	int nid, active_nodes = 0;
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1862
				1863	for_each_online_node(nid) {
				1864	faults = group_faults_cpu(numa_group, nid);
				1865	if (faults > max_faults)
				1866	max_faults = faults;
				1867	}
				1868
				1869	for_each_online_node(nid) {
				1870	faults = group_faults_cpu(numa_group, nid);
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1871	if (faults * ACTIVE_NODE_FRACTION > max_faults)
				1872	active_nodes++;
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1873	}
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	1874
				1875	numa_group->max_faults_cpu = max_faults;
				1876	numa_group->active_nodes = active_nodes;
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1877	}
				1878
				1879	/*
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1880	* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
				1881	* increments. The more local the fault statistics are, the higher the scan
Rik van Riel	a22b4b0	2014-06-23 11:41:35 -0400	[diff] [blame]	1882	* period will be for the next scan window. If local/(local+remote) ratio is
				1883	* below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
				1884	* the scan period will decrease. Aim for 70% local accesses.
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1885	*/
				1886	#define NUMA_PERIOD_SLOTS 10
Rik van Riel	a22b4b0	2014-06-23 11:41:35 -0400	[diff] [blame]	1887	#define NUMA_PERIOD_THRESHOLD 7
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1888
				1889	/*
				1890	* Increase the scan period (slow down scanning) if the majority of
				1891	* our memory is already on our local node, or if the majority of
				1892	* the page accesses are shared with other processes.
				1893	* Otherwise, decrease the scan period.
				1894	*/
				1895	static void update_task_scan_period(struct task_struct *p,
				1896	unsigned long shared, unsigned long private)
				1897	{
				1898	unsigned int period_slot;
				1899	int ratio;
				1900	int diff;
				1901
				1902	unsigned long remote = p->numa_faults_locality[0];
				1903	unsigned long local = p->numa_faults_locality[1];
				1904
				1905	/*
				1906	* If there were no record hinting faults then either the task is
				1907	* completely idle or all activity is areas that are not of interest
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	1908	* to automatic numa balancing. Related to that, if there were failed
				1909	* migration then it implies we are migrating too quickly or the local
				1910	* node is overloaded. In either case, scan slower
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1911	*/
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	1912	if (local + shared == 0 \|\| p->numa_faults_locality[2]) {
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1913	p->numa_scan_period = min(p->numa_scan_period_max,
				1914	p->numa_scan_period << 1);
				1915
				1916	p->mm->numa_next_scan = jiffies +
				1917	msecs_to_jiffies(p->numa_scan_period);
				1918
				1919	return;
				1920	}
				1921
				1922	/*
				1923	* Prepare to scale scan period relative to the current period.
				1924	* == NUMA_PERIOD_THRESHOLD scan period stays the same
				1925	* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
				1926	* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
				1927	*/
				1928	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
				1929	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
				1930	if (ratio >= NUMA_PERIOD_THRESHOLD) {
				1931	int slot = ratio - NUMA_PERIOD_THRESHOLD;
				1932	if (!slot)
				1933	slot = 1;
				1934	diff = slot * period_slot;
				1935	} else {
				1936	diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
				1937
				1938	/*
				1939	* Scale scan rate increases based on sharing. There is an
				1940	* inverse relationship between the degree of sharing and
				1941	* the adjustment made to the scanning period. Broadly
				1942	* speaking the intent is that there is little point
				1943	* scanning faster if shared accesses dominate as it may
				1944	* simply bounce migrations uselessly
				1945	*/
Yasuaki Ishimatsu	2847c90	2014-10-22 16:04:35 +0900	[diff] [blame]	1946	ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1947	diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
				1948	}
				1949
				1950	p->numa_scan_period = clamp(p->numa_scan_period + diff,
				1951	task_scan_min(p), task_scan_max(p));
				1952	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
				1953	}
				1954
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1955	/*
				1956	* Get the fraction of time the task has been running since the last
				1957	* NUMA placement cycle. The scheduler keeps similar statistics, but
				1958	* decays those on a 32ms period, which is orders of magnitude off
				1959	* from the dozens-of-seconds NUMA balancing period. Use the scheduler
				1960	* stats only if the task is so new there are no NUMA statistics yet.
				1961	*/
				1962	static u64 numa_get_avg_runtime(struct task_struct p, u64 period)
				1963	{
				1964	u64 runtime, delta, now;
				1965	/* Use the start of this time slice to avoid calculations. */
				1966	now = p->se.exec_start;
				1967	runtime = p->se.sum_exec_runtime;
				1968
				1969	if (p->last_task_numa_placement) {
				1970	delta = runtime - p->last_sum_exec_runtime;
				1971	*period = now - p->last_task_numa_placement;
				1972	} else {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	1973	delta = p->se.avg.load_sum / p->se.load.weight;
				1974	*period = LOAD_AVG_MAX;
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1975	}
				1976
				1977	p->last_sum_exec_runtime = runtime;
				1978	p->last_task_numa_placement = now;
				1979
				1980	return delta;
				1981	}
				1982
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	1983	/*
				1984	* Determine the preferred nid for a task in a numa_group. This needs to
				1985	* be done in a way that produces consistent results with group_weight,
				1986	* otherwise workloads might not converge.
				1987	*/
				1988	static int preferred_group_nid(struct task_struct *p, int nid)
				1989	{
				1990	nodemask_t nodes;
				1991	int dist;
				1992
				1993	/* Direct connections between all NUMA nodes. */
				1994	if (sched_numa_topology_type == NUMA_DIRECT)
				1995	return nid;
				1996
				1997	/*
				1998	* On a system with glueless mesh NUMA topology, group_weight
				1999	* scores nodes according to the number of NUMA hinting faults on
				2000	* both the node itself, and on nearby nodes.
				2001	*/
				2002	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				2003	unsigned long score, max_score = 0;
				2004	int node, max_node = nid;
				2005
				2006	dist = sched_max_numa_distance;
				2007
				2008	for_each_online_node(node) {
				2009	score = group_weight(p, node, dist);
				2010	if (score > max_score) {
				2011	max_score = score;
				2012	max_node = node;
				2013	}
				2014	}
				2015	return max_node;
				2016	}
				2017
				2018	/*
				2019	* Finding the preferred nid in a system with NUMA backplane
				2020	* interconnect topology is more involved. The goal is to locate
				2021	* tasks from numa_groups near each other in the system, and
				2022	* untangle workloads from different sides of the system. This requires
				2023	* searching down the hierarchy of node groups, recursively searching
				2024	* inside the highest scoring group of nodes. The nodemask tricks
				2025	* keep the complexity of the search down.
				2026	*/
				2027	nodes = node_online_map;
				2028	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
				2029	unsigned long max_faults = 0;
Jan Beulich	8190747	2015-01-23 08:25:38 +0000	[diff] [blame]	2030	nodemask_t max_group = NODE_MASK_NONE;
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	2031	int a, b;
				2032
				2033	/* Are there nodes at this distance from each other? */
				2034	if (!find_numa_distance(dist))
				2035	continue;
				2036
				2037	for_each_node_mask(a, nodes) {
				2038	unsigned long faults = 0;
				2039	nodemask_t this_group;
				2040	nodes_clear(this_group);
				2041
				2042	/* Sum group's NUMA faults; includes a==b case. */
				2043	for_each_node_mask(b, nodes) {
				2044	if (node_distance(a, b) < dist) {
				2045	faults += group_faults(p, b);
				2046	node_set(b, this_group);
				2047	node_clear(b, nodes);
				2048	}
				2049	}
				2050
				2051	/* Remember the top group. */
				2052	if (faults > max_faults) {
				2053	max_faults = faults;
				2054	max_group = this_group;
				2055	/*
				2056	* subtle: at the smallest distance there is
				2057	* just one node left in each "group", the
				2058	* winner is the preferred nid.
				2059	*/
				2060	nid = a;
				2061	}
				2062	}
				2063	/* Next round, evaluate the nodes within max_group. */
Jan Beulich	890a540	2015-02-09 12:30:00 +0100	[diff] [blame]	2064	if (!max_faults)
				2065	break;
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	2066	nodes = max_group;
				2067	}
				2068	return nid;
				2069	}
				2070
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2071	static void task_numa_placement(struct task_struct *p)
				2072	{
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2073	int seq, nid, max_nid = -1, max_group_nid = -1;
				2074	unsigned long max_faults = 0, max_group_faults = 0;
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	2075	unsigned long fault_types[2] = { 0, 0 };
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2076	unsigned long total_faults;
				2077	u64 runtime, period;
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	2078	spinlock_t *group_lock = NULL;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2079
Jason Low	7e5a2c1	2015-04-30 17:28:14 -0700	[diff] [blame]	2080	/*
				2081	* The p->mm->numa_scan_seq field gets updated without
				2082	* exclusive access. Use READ_ONCE() here to ensure
				2083	* that the field is read in a single access:
				2084	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	2085	seq = READ_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2086	if (p->numa_scan_seq == seq)
				2087	return;
				2088	p->numa_scan_seq = seq;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2089	p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2090
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2091	total_faults = p->numa_faults_locality[0] +
				2092	p->numa_faults_locality[1];
				2093	runtime = numa_get_avg_runtime(p, &period);
				2094
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	2095	/* If the task is part of a group prevent parallel updates to group stats */
				2096	if (p->numa_group) {
				2097	group_lock = &p->numa_group->lock;
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2098	spin_lock_irq(group_lock);
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	2099	}
				2100
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	2101	/* Find the node with the highest number of faults */
				2102	for_each_online_node(nid) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2103	/* Keep track of the offsets in numa_faults array */
				2104	int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2105	unsigned long faults = 0, group_faults = 0;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2106	int priv;
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	2107
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2108	for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2109	long diff, f_diff, f_weight;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2110
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2111	mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
				2112	membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
				2113	cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
				2114	cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	2115
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	2116	/* Decay existing window, copy faults since last scan */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2117	diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
				2118	fault_types[priv] += p->numa_faults[membuf_idx];
				2119	p->numa_faults[membuf_idx] = 0;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	2120
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2121	/*
				2122	* Normalize the faults_from, so all tasks in a group
				2123	* count according to CPU use, instead of by the raw
				2124	* number of faults. Tasks with little runtime have
				2125	* little over-all impact on throughput, and thus their
				2126	* faults are less important.
				2127	*/
				2128	f_weight = div64_u64(runtime << 16, period + 1);
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2129	f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	2130	(total_faults + 1);
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2131	f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
				2132	p->numa_faults[cpubuf_idx] = 0;
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	2133
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2134	p->numa_faults[mem_idx] += diff;
				2135	p->numa_faults[cpu_idx] += f_diff;
				2136	faults += p->numa_faults[mem_idx];
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2137	p->total_numa_faults += diff;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2138	if (p->numa_group) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2139	/*
				2140	* safe because we can only change our own group
				2141	*
				2142	* mem_idx represents the offset for a given
				2143	* nid and priv in a specific region because it
				2144	* is at the beginning of the numa_faults array.
				2145	*/
				2146	p->numa_group->faults[mem_idx] += diff;
				2147	p->numa_group->faults_cpu[mem_idx] += f_diff;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2148	p->numa_group->total_faults += diff;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2149	group_faults += p->numa_group->faults[mem_idx];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2150	}
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	2151	}
				2152
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	2153	if (faults > max_faults) {
				2154	max_faults = faults;
				2155	max_nid = nid;
				2156	}
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2157
				2158	if (group_faults > max_group_faults) {
				2159	max_group_faults = group_faults;
				2160	max_group_nid = nid;
				2161	}
				2162	}
				2163
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	2164	update_task_scan_period(p, fault_types[0], fault_types[1]);
				2165
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	2166	if (p->numa_group) {
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	2167	numa_group_count_active_nodes(p->numa_group);
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2168	spin_unlock_irq(group_lock);
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	2169	max_nid = preferred_group_nid(p, max_group_nid);
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	2170	}
				2171
Rik van Riel	bb97fc3	2014-06-04 16:33:15 -0400	[diff] [blame]	2172	if (max_faults) {
				2173	/* Set the new preferred node */
				2174	if (max_nid != p->numa_preferred_nid)
				2175	sched_setnuma(p, max_nid);
				2176
				2177	if (task_node(p) != p->numa_preferred_nid)
				2178	numa_migrate_preferred(p);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	2179	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2180	}
				2181
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2182	static inline int get_numa_group(struct numa_group *grp)
				2183	{
				2184	return atomic_inc_not_zero(&grp->refcount);
				2185	}
				2186
				2187	static inline void put_numa_group(struct numa_group *grp)
				2188	{
				2189	if (atomic_dec_and_test(&grp->refcount))
				2190	kfree_rcu(grp, rcu);
				2191	}
				2192
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	2193	static void task_numa_group(struct task_struct *p, int cpupid, int flags,
				2194	int *priv)
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2195	{
				2196	struct numa_group grp, my_grp;
				2197	struct task_struct *tsk;
				2198	bool join = false;
				2199	int cpu = cpupid_to_cpu(cpupid);
				2200	int i;
				2201
				2202	if (unlikely(!p->numa_group)) {
				2203	unsigned int size = sizeof(struct numa_group) +
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	2204	4nr_node_idssizeof(unsigned long);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2205
				2206	grp = kzalloc(size, GFP_KERNEL \| __GFP_NOWARN);
				2207	if (!grp)
				2208	return;
				2209
				2210	atomic_set(&grp->refcount, 1);
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	2211	grp->active_nodes = 1;
				2212	grp->max_faults_cpu = 0;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2213	spin_lock_init(&grp->lock);
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	2214	grp->gid = p->pid;
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	2215	/* Second half of the array tracks nids where faults happen */
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2216	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
				2217	nr_node_ids;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2218
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2219	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2220	grp->faults[i] = p->numa_faults[i];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2221
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2222	grp->total_faults = p->total_numa_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2223
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2224	grp->nr_tasks++;
				2225	rcu_assign_pointer(p->numa_group, grp);
				2226	}
				2227
				2228	rcu_read_lock();
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	2229	tsk = READ_ONCE(cpu_rq(cpu)->curr);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2230
				2231	if (!cpupid_match_pid(tsk, cpupid))
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2232	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2233
				2234	grp = rcu_dereference(tsk->numa_group);
				2235	if (!grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2236	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2237
				2238	my_grp = p->numa_group;
				2239	if (grp == my_grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2240	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2241
				2242	/*
				2243	* Only join the other group if its bigger; if we're the bigger group,
				2244	* the other task will join us.
				2245	*/
				2246	if (my_grp->nr_tasks > grp->nr_tasks)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2247	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2248
				2249	/*
				2250	* Tie-break on the grp address.
				2251	*/
				2252	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2253	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2254
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2255	/* Always join threads in the same process. */
				2256	if (tsk->mm == current->mm)
				2257	join = true;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2258
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2259	/* Simple filter to avoid false positives due to PID collisions */
				2260	if (flags & TNF_SHARED)
				2261	join = true;
				2262
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	2263	/* Update priv based on whether false sharing was detected */
				2264	*priv = !join;
				2265
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2266	if (join && !get_numa_group(grp))
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2267	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2268
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2269	rcu_read_unlock();
				2270
				2271	if (!join)
				2272	return;
				2273
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2274	BUG_ON(irqs_disabled());
				2275	double_lock_irq(&my_grp->lock, &grp->lock);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2276
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2277	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2278	my_grp->faults[i] -= p->numa_faults[i];
				2279	grp->faults[i] += p->numa_faults[i];
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2280	}
				2281	my_grp->total_faults -= p->total_numa_faults;
				2282	grp->total_faults += p->total_numa_faults;
				2283
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2284	my_grp->nr_tasks--;
				2285	grp->nr_tasks++;
				2286
				2287	spin_unlock(&my_grp->lock);
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2288	spin_unlock_irq(&grp->lock);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2289
				2290	rcu_assign_pointer(p->numa_group, grp);
				2291
				2292	put_numa_group(my_grp);
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2293	return;
				2294
				2295	no_join:
				2296	rcu_read_unlock();
				2297	return;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2298	}
				2299
				2300	void task_numa_free(struct task_struct *p)
				2301	{
				2302	struct numa_group *grp = p->numa_group;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2303	void *numa_faults = p->numa_faults;
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2304	unsigned long flags;
				2305	int i;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2306
				2307	if (grp) {
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2308	spin_lock_irqsave(&grp->lock, flags);
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2309	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2310	grp->faults[i] -= p->numa_faults[i];
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2311	grp->total_faults -= p->total_numa_faults;
				2312
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2313	grp->nr_tasks--;
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2314	spin_unlock_irqrestore(&grp->lock, flags);
Andreea-Cristina Bernat	35b123e	2014-08-22 17:50:43 +0300	[diff] [blame]	2315	RCU_INIT_POINTER(p->numa_group, NULL);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2316	put_numa_group(grp);
				2317	}
				2318
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2319	p->numa_faults = NULL;
Rik van Riel	8272701	2013-10-07 11:29:28 +0100	[diff] [blame]	2320	kfree(numa_faults);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2321	}
				2322
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2323	/*
				2324	* Got a PROT_NONE fault for a page on @node.
				2325	*/
Rik van Riel	58b46da	2014-01-27 17:03:47 -0500	[diff] [blame]	2326	void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2327	{
				2328	struct task_struct *p = current;
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame]	2329	bool migrated = flags & TNF_MIGRATED;
Rik van Riel	58b46da	2014-01-27 17:03:47 -0500	[diff] [blame]	2330	int cpu_node = task_node(current);
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2331	int local = !!(flags & TNF_FAULT_LOCAL);
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	2332	struct numa_group *ng;
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	2333	int priv;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2334
Srikar Dronamraju	2a59572	2015-08-11 21:54:21 +0530	[diff] [blame]	2335	if (!static_branch_likely(&sched_numa_balancing))
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	2336	return;
				2337
Mel Gorman	9ff1d9f	2013-10-07 11:29:04 +0100	[diff] [blame]	2338	/* for example, ksmd faulting in a user's mm */
				2339	if (!p->mm)
				2340	return;
				2341
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2342	/* Allocate buffer to track faults on a per-node basis */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2343	if (unlikely(!p->numa_faults)) {
				2344	int size = sizeof(p->numa_faults)
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2345	NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2346
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2347	p->numa_faults = kzalloc(size, GFP_KERNEL\|__GFP_NOWARN);
				2348	if (!p->numa_faults)
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2349	return;
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	2350
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2351	p->total_numa_faults = 0;
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	2352	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2353	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2354
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	2355	/*
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2356	* First accesses are treated as private, otherwise consider accesses
				2357	* to be private if the accessing pid has not changed
				2358	*/
				2359	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
				2360	priv = 1;
				2361	} else {
				2362	priv = cpupid_match_pid(p, last_cpupid);
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame]	2363	if (!priv && !(flags & TNF_NO_GROUP))
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	2364	task_numa_group(p, last_cpupid, flags, &priv);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2365	}
				2366
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2367	/*
				2368	* If a workload spans multiple NUMA nodes, a shared fault that
				2369	* occurs wholly within the set of nodes that the workload is
				2370	* actively using should be counted as local. This allows the
				2371	* scan rate to slow down when a workload has settled down.
				2372	*/
Rik van Riel	4142c3e	2016-01-25 17:07:39 -0500	[diff] [blame]	2373	ng = p->numa_group;
				2374	if (!priv && !local && ng && ng->active_nodes > 1 &&
				2375	numa_is_active_node(cpu_node, ng) &&
				2376	numa_is_active_node(mem_node, ng))
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2377	local = 1;
				2378
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2379	task_numa_placement(p);
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2380
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	2381	/*
				2382	* Retry task to preferred node migration periodically, in case it
				2383	* case it previously failed, or the scheduler moved us.
				2384	*/
				2385	if (time_after(jiffies, p->numa_migrate_retry))
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	2386	numa_migrate_preferred(p);
				2387
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	2388	if (migrated)
				2389	p->numa_pages_migrated += pages;
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	2390	if (flags & TNF_MIGRATE_FAIL)
				2391	p->numa_faults_locality[2] += pages;
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	2392
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2393	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
				2394	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2395	p->numa_faults_locality[local] += pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2396	}
				2397
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2398	static void reset_ptenuma_scan(struct task_struct *p)
				2399	{
Jason Low	7e5a2c1	2015-04-30 17:28:14 -0700	[diff] [blame]	2400	/*
				2401	* We only did a read acquisition of the mmap sem, so
				2402	* p->mm->numa_scan_seq is written to without exclusive access
				2403	* and the update is not guaranteed to be atomic. That's not
				2404	* much of an issue though, since this is just used for
				2405	* statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
				2406	* expensive, to avoid any form of compiler optimizations:
				2407	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	2408	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2409	p->mm->numa_scan_offset = 0;
				2410	}
				2411
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2412	/*
				2413	* The expensive part of numa migration is done from task_work context.
				2414	* Triggered from task_tick_numa().
				2415	*/
				2416	void task_numa_work(struct callback_head *work)
				2417	{
				2418	unsigned long migrate, next_scan, now = jiffies;
				2419	struct task_struct *p = current;
				2420	struct mm_struct *mm = p->mm;
Rik van Riel	5117084	2015-11-05 15:56:23 -0500	[diff] [blame]	2421	u64 runtime = p->se.sum_exec_runtime;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2422	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2423	unsigned long start, end;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2424	unsigned long nr_pte_updates = 0;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2425	long pages, virtpages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2426
Peter Zijlstra	9148a3a	2016-09-20 22:34:51 +0200	[diff] [blame]	2427	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2428
				2429	work->next = work; /* protect against double add */
				2430	/*
				2431	* Who cares about NUMA placement when they're dying.
				2432	*
				2433	* NOTE: make sure not to dereference p->mm before this check,
				2434	* exit_task_work() happens _after_ exit_mm() so we could be called
				2435	* without p->mm even though we still had it when we enqueued this
				2436	* work.
				2437	*/
				2438	if (p->flags & PF_EXITING)
				2439	return;
				2440
Mel Gorman	930aa17	2013-10-07 11:29:37 +0100	[diff] [blame]	2441	if (!mm->numa_next_scan) {
Mel Gorman	7e8d16b	2013-10-07 11:28:54 +0100	[diff] [blame]	2442	mm->numa_next_scan = now +
				2443	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	2444	}
				2445
				2446	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2447	* Enforce maximal scan/migration frequency..
				2448	*/
				2449	migrate = mm->numa_next_scan;
				2450	if (time_before(now, migrate))
				2451	return;
				2452
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2453	if (p->numa_scan_period == 0) {
				2454	p->numa_scan_period_max = task_scan_max(p);
				2455	p->numa_scan_period = task_scan_min(p);
				2456	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2457
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	2458	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2459	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				2460	return;
				2461
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	2462	/*
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	2463	* Delay this task enough that another task of this mm will likely win
				2464	* the next time around.
				2465	*/
				2466	p->node_stamp += 2 * TICK_NSEC;
				2467
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2468	start = mm->numa_scan_offset;
				2469	pages = sysctl_numa_balancing_scan_size;
				2470	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2471	virtpages = pages * 8; /* Scan up to this much virtual space */
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2472	if (!pages)
				2473	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2474
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2475
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2476	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2477	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2478	if (!vma) {
				2479	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2480	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2481	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2482	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2483	for (; vma; vma = vma->vm_next) {
Naoya Horiguchi	6b79c57	2015-04-07 14:26:47 -0700	[diff] [blame]	2484	if (!vma_migratable(vma) \|\| !vma_policy_mof(vma) \|\|
Mel Gorman	8e76d4e	2015-06-10 11:15:00 -0700	[diff] [blame]	2485	is_vm_hugetlb_page(vma) \|\| (vma->vm_flags & VM_MIXEDMAP)) {
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2486	continue;
Naoya Horiguchi	6b79c57	2015-04-07 14:26:47 -0700	[diff] [blame]	2487	}
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2488
Mel Gorman	4591ce4f	2013-10-07 11:29:13 +0100	[diff] [blame]	2489	/*
				2490	* Shared library pages mapped by multiple processes are not
				2491	* migrated as it is expected they are cache replicated. Avoid
				2492	* hinting faults in read-only file-backed mappings or the vdso
				2493	* as migrating the pages will be of marginal benefit.
				2494	*/
				2495	if (!vma->vm_mm \|\|
				2496	(vma->vm_file && (vma->vm_flags & (VM_READ\|VM_WRITE)) == (VM_READ)))
				2497	continue;
				2498
Mel Gorman	3c67f47	2013-12-18 17:08:40 -0800	[diff] [blame]	2499	/*
				2500	* Skip inaccessible VMAs to avoid any confusion between
				2501	* PROT_NONE and NUMA hinting ptes
				2502	*/
				2503	if (!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE)))
				2504	continue;
				2505
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2506	do {
				2507	start = max(start, vma->vm_start);
				2508	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				2509	end = min(end, vma->vm_end);
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2510	nr_pte_updates = change_prot_numa(vma, start, end);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2511
				2512	/*
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2513	* Try to scan sysctl_numa_balancing_scan_size worth of
				2514	* hpages that have at least one present PTE that
				2515	* is not already pte-numa. If the VMA contains
				2516	* areas that are unused or already full of prot_numa
				2517	* PTEs, scan up to virtpages, to skip through those
				2518	* areas faster.
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2519	*/
				2520	if (nr_pte_updates)
				2521	pages -= (end - start) >> PAGE_SHIFT;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2522	virtpages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2523
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2524	start = end;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2525	if (pages <= 0 \|\| virtpages <= 0)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2526	goto out;
Rik van Riel	3cf1962	2014-02-18 17:12:44 -0500	[diff] [blame]	2527
				2528	cond_resched();
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2529	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2530	}
				2531
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2532	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2533	/*
Peter Zijlstra	c69307d	2013-10-07 11:28:41 +0100	[diff] [blame]	2534	* It is possible to reach the end of the VMA list but the last few
				2535	* VMAs are not guaranteed to be vma_migratable. If they are not, we
				2536	* would find the !migratable VMA on the next scan but not reset the
				2537	* scanner to the start so check it now.
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2538	*/
				2539	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2540	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2541	else
				2542	reset_ptenuma_scan(p);
				2543	up_read(&mm->mmap_sem);
Rik van Riel	5117084	2015-11-05 15:56:23 -0500	[diff] [blame]	2544
				2545	/*
				2546	* Make sure tasks use at least 32x as much time to run other code
				2547	* than they used here, to limit NUMA PTE scanning overhead to 3% max.
				2548	* Usually update_task_scan_period slows down scanning enough; on an
				2549	* overloaded system we need to limit overhead on a per task basis.
				2550	*/
				2551	if (unlikely(p->se.sum_exec_runtime != runtime)) {
				2552	u64 diff = p->se.sum_exec_runtime - runtime;
				2553	p->node_stamp += 32 * diff;
				2554	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2555	}
				2556
				2557	/*
				2558	* Drive the periodic memory faults..
				2559	*/
				2560	void task_tick_numa(struct rq rq, struct task_struct curr)
				2561	{
				2562	struct callback_head *work = &curr->numa_work;
				2563	u64 period, now;
				2564
				2565	/*
				2566	* We don't care about NUMA placement if we don't have memory.
				2567	*/
				2568	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				2569	return;
				2570
				2571	/*
				2572	* Using runtime rather than walltime has the dual advantage that
				2573	* we (mostly) drive the selection from busy threads and that the
				2574	* task needs to have done some actual work before we bother with
				2575	* NUMA placement.
				2576	*/
				2577	now = curr->se.sum_exec_runtime;
				2578	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				2579
Rik van Riel	25b3e5a	2015-11-05 15:56:22 -0500	[diff] [blame]	2580	if (now > curr->node_stamp + period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	2581	if (!curr->node_stamp)
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2582	curr->numa_scan_period = task_scan_min(curr);
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	2583	curr->node_stamp += period;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2584
				2585	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				2586	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				2587	task_work_add(curr, work, true);
				2588	}
				2589	}
				2590	}
				2591	#else
				2592	static void task_tick_numa(struct rq rq, struct task_struct curr)
				2593	{
				2594	}
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2595
				2596	static inline void account_numa_enqueue(struct rq rq, struct task_struct p)
				2597	{
				2598	}
				2599
				2600	static inline void account_numa_dequeue(struct rq rq, struct task_struct p)
				2601	{
				2602	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2603	#endif /* CONFIG_NUMA_BALANCING */
				2604
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2605	static void
				2606	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				2607	{
				2608	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	2609	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2610	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	2611	#ifdef CONFIG_SMP
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2612	if (entity_is_task(se)) {
				2613	struct rq *rq = rq_of(cfs_rq);
				2614
				2615	account_numa_enqueue(rq, task_of(se));
				2616	list_add(&se->group_node, &rq->cfs_tasks);
				2617	}
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	2618	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2619	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2620	}
				2621
				2622	static void
				2623	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				2624	{
				2625	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	2626	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2627	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Tim Chen	bfdb198	2016-02-01 14:47:59 -0800	[diff] [blame]	2628	#ifdef CONFIG_SMP
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2629	if (entity_is_task(se)) {
				2630	account_numa_dequeue(rq_of(cfs_rq), task_of(se));
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	2631	list_del_init(&se->group_node);
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2632	}
Tim Chen	bfdb198	2016-02-01 14:47:59 -0800	[diff] [blame]	2633	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2634	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2635	}
				2636
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2637	#ifdef CONFIG_FAIR_GROUP_SCHED
				2638	# ifdef CONFIG_SMP
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2639	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2640	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2641	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2642
Peter Zijlstra	ea1dc6f	2016-06-24 16:11:02 +0200	[diff] [blame]	2643	/*
				2644	* This really should be: cfs_rq->avg.load_avg, but instead we use
				2645	* cfs_rq->load.weight, which is its upper bound. This helps ramp up
				2646	* the shares for small weight interactive tasks.
				2647	*/
				2648	load = scale_load_down(cfs_rq->load.weight);
				2649
				2650	tg_weight = atomic_long_read(&tg->load_avg);
				2651
				2652	/* Ensure tg_weight >= load */
				2653	tg_weight -= cfs_rq->tg_load_avg_contrib;
				2654	tg_weight += load;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2655
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2656	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2657	if (tg_weight)
				2658	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2659
				2660	if (shares < MIN_SHARES)
				2661	shares = MIN_SHARES;
				2662	if (shares > tg->shares)
				2663	shares = tg->shares;
				2664
				2665	return shares;
				2666	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2667	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2668	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2669	{
				2670	return tg->shares;
				2671	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2672	# endif /* CONFIG_SMP */
Peter Zijlstra	ea1dc6f	2016-06-24 16:11:02 +0200	[diff] [blame]	2673
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2674	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				2675	unsigned long weight)
				2676	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	2677	if (se->on_rq) {
				2678	/* commit outstanding execution time */
				2679	if (cfs_rq->curr == se)
				2680	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2681	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	2682	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2683
				2684	update_load_set(&se->load, weight);
				2685
				2686	if (se->on_rq)
				2687	account_entity_enqueue(cfs_rq, se);
				2688	}
				2689
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2690	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				2691
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2692	static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2693	{
				2694	struct task_group *tg;
				2695	struct sched_entity *se;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2696	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2697
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2698	tg = cfs_rq->tg;
				2699	se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2700	if (!se \|\| throttled_hierarchy(cfs_rq))
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2701	return;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2702	#ifndef CONFIG_SMP
				2703	if (likely(se->load.weight == tg->shares))
				2704	return;
				2705	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2706	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2707
				2708	reweight_entity(cfs_rq_of(se), se, shares);
				2709	}
				2710	#else /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2711	static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2712	{
				2713	}
				2714	#endif /* CONFIG_FAIR_GROUP_SCHED */
				2715
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	2716	#ifdef CONFIG_SMP
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2717	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				2718	static const u32 runnable_avg_yN_inv[] = {
				2719	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				2720	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				2721	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				2722	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				2723	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				2724	0x85aac367, 0x82cd8698,
				2725	};
				2726
				2727	/*
				2728	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				2729	* over-estimates when re-combining.
				2730	*/
				2731	static const u32 runnable_avg_yN_sum[] = {
				2732	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				2733	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				2734	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				2735	};
				2736
				2737	/*
Yuyang Du	7b20b91	2016-05-03 05:54:27 +0800	[diff] [blame]	2738	* Precomputed \Sum y^k { 1<=k<=n, where n%32=0 }. Values are rounded down to
				2739	* lower integers. See Documentation/scheduler/sched-avg.txt how these
				2740	* were generated:
				2741	*/
				2742	static const u32 __accumulated_sum_N32[] = {
				2743	0, 23371, 35056, 40899, 43820, 45281,
				2744	46011, 46376, 46559, 46650, 46696, 46719,
				2745	};
				2746
				2747	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2748	* Approximate:
				2749	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				2750	*/
				2751	static __always_inline u64 decay_load(u64 val, u64 n)
				2752	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2753	unsigned int local_n;
				2754
				2755	if (!n)
				2756	return val;
				2757	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				2758	return 0;
				2759
				2760	/* after bounds checking we can collapse to 32-bit */
				2761	local_n = n;
				2762
				2763	/*
				2764	* As y^PERIOD = 1/2, we can combine
Zhihui Zhang	9c58c79	2014-09-20 21:24:36 -0400	[diff] [blame]	2765	* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
				2766	* With a look-up table which covers y^n (n<PERIOD)
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2767	*
				2768	* To achieve constant time decay_load.
				2769	*/
				2770	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				2771	val >>= local_n / LOAD_AVG_PERIOD;
				2772	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2773	}
				2774
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2775	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
				2776	return val;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2777	}
				2778
				2779	/*
				2780	* For updates fully spanning n periods, the contribution to runnable
				2781	* average will be: \Sum 1024*y^n
				2782	*
				2783	* We can compute this reasonably efficiently by combining:
				2784	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				2785	*/
				2786	static u32 __compute_runnable_contrib(u64 n)
				2787	{
				2788	u32 contrib = 0;
				2789
				2790	if (likely(n <= LOAD_AVG_PERIOD))
				2791	return runnable_avg_yN_sum[n];
				2792	else if (unlikely(n >= LOAD_AVG_MAX_N))
				2793	return LOAD_AVG_MAX;
				2794
Yuyang Du	7b20b91	2016-05-03 05:54:27 +0800	[diff] [blame]	2795	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
				2796	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
				2797	n %= LOAD_AVG_PERIOD;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2798	contrib = decay_load(contrib, n);
				2799	return contrib + runnable_avg_yN_sum[n];
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2800	}
				2801
Peter Zijlstra	54a2138	2015-09-07 15:05:42 +0200	[diff] [blame]	2802	#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2803
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2804	/*
				2805	* We can represent the historical contribution to runnable average as the
				2806	* coefficients of a geometric series. To do this we sub-divide our runnable
				2807	* history into segments of approximately 1ms (1024us); label the segment that
				2808	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				2809	*
				2810	* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
				2811	*      p0            p1           p2
				2812	*     (now)       (~1ms ago)  (~2ms ago)
				2813	*
				2814	* Let u_i denote the fraction of p_i that the entity was runnable.
				2815	*
				2816	* We then designate the fractions u_i as our co-efficients, yielding the
				2817	* following representation of historical load:
				2818	* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
				2819	*
				2820	* We choose y based on the width of a reasonable scheduling period, fixing:
				2821	* y^32 = 0.5
				2822	*
				2823	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				2824	* approximately half as much as the contribution to load within the last ms
				2825	* (u_0).
				2826	*
				2827	* When a period "rolls over" and we have new u_0`, multiplying the previous
				2828	* sum again by y is sufficient to update:
				2829	* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
				2830	* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
				2831	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2832	static __always_inline int
				2833	__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2834	unsigned long weight, int running, struct cfs_rq *cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2835	{
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2836	u64 delta, scaled_delta, periods;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2837	u32 contrib;
Peter Zijlstra	6115c79	2015-09-07 15:09:15 +0200	[diff] [blame]	2838	unsigned int delta_w, scaled_delta_w, decayed = 0;
Dietmar Eggemann	6f2b045	2015-09-07 14:57:22 +0100	[diff] [blame]	2839	unsigned long scale_freq, scale_cpu;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2840
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2841	delta = now - sa->last_update_time;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2842	/*
				2843	* This should only happen when time goes backwards, which it
				2844	* unfortunately does during sched clock init when we swap over to TSC.
				2845	*/
				2846	if ((s64)delta < 0) {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2847	sa->last_update_time = now;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2848	return 0;
				2849	}
				2850
				2851	/*
				2852	* Use 1024ns as the unit of measurement since it's a reasonable
				2853	* approximation of 1us and fast to compute.
				2854	*/
				2855	delta >>= 10;
				2856	if (!delta)
				2857	return 0;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2858	sa->last_update_time = now;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2859
Dietmar Eggemann	6f2b045	2015-09-07 14:57:22 +0100	[diff] [blame]	2860	scale_freq = arch_scale_freq_capacity(NULL, cpu);
				2861	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
				2862
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2863	/* delta_w is the amount already accumulated against our next period */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2864	delta_w = sa->period_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2865	if (delta + delta_w >= 1024) {
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2866	decayed = 1;
				2867
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2868	/* how much left for next period will start over, we don't know yet */
				2869	sa->period_contrib = 0;
				2870
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2871	/*
				2872	* Now that we know we're crossing a period boundary, figure
				2873	* out how much from delta we need to complete the current
				2874	* period and accrue it.
				2875	*/
				2876	delta_w = 1024 - delta_w;
Peter Zijlstra	54a2138	2015-09-07 15:05:42 +0200	[diff] [blame]	2877	scaled_delta_w = cap_scale(delta_w, scale_freq);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2878	if (weight) {
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2879	sa->load_sum += weight * scaled_delta_w;
				2880	if (cfs_rq) {
				2881	cfs_rq->runnable_load_sum +=
				2882	weight * scaled_delta_w;
				2883	}
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2884	}
Vincent Guittot	36ee28e	2015-02-27 16:54:04 +0100	[diff] [blame]	2885	if (running)
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2886	sa->util_sum += scaled_delta_w * scale_cpu;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2887
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2888	delta -= delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2889
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2890	/* Figure out how many additional periods this update spans */
				2891	periods = delta / 1024;
				2892	delta %= 1024;
				2893
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2894	sa->load_sum = decay_load(sa->load_sum, periods + 1);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2895	if (cfs_rq) {
				2896	cfs_rq->runnable_load_sum =
				2897	decay_load(cfs_rq->runnable_load_sum, periods + 1);
				2898	}
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2899	sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2900
				2901	/* Efficiently calculate \sum (1..n_period) 1024*y^i */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2902	contrib = __compute_runnable_contrib(periods);
Peter Zijlstra	54a2138	2015-09-07 15:05:42 +0200	[diff] [blame]	2903	contrib = cap_scale(contrib, scale_freq);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2904	if (weight) {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2905	sa->load_sum += weight * contrib;
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2906	if (cfs_rq)
				2907	cfs_rq->runnable_load_sum += weight * contrib;
				2908	}
Vincent Guittot	36ee28e	2015-02-27 16:54:04 +0100	[diff] [blame]	2909	if (running)
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2910	sa->util_sum += contrib * scale_cpu;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2911	}
				2912
				2913	/* Remainder of delta accrued against u_0` */
Peter Zijlstra	54a2138	2015-09-07 15:05:42 +0200	[diff] [blame]	2914	scaled_delta = cap_scale(delta, scale_freq);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2915	if (weight) {
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2916	sa->load_sum += weight * scaled_delta;
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2917	if (cfs_rq)
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2918	cfs_rq->runnable_load_sum += weight * scaled_delta;
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2919	}
Vincent Guittot	36ee28e	2015-02-27 16:54:04 +0100	[diff] [blame]	2920	if (running)
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2921	sa->util_sum += scaled_delta * scale_cpu;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2922
				2923	sa->period_contrib += delta;
				2924
				2925	if (decayed) {
				2926	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2927	if (cfs_rq) {
				2928	cfs_rq->runnable_load_avg =
				2929	div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
				2930	}
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2931	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2932	}
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2933
				2934	return decayed;
				2935	}
				2936
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	2937	/*
				2938	* Signed add and clamp on underflow.
				2939	*
				2940	* Explicitly do a load-store to ensure the intermediate value never hits
				2941	* memory. This allows lockless observations without ever seeing the negative
				2942	* values.
				2943	*/
				2944	#define add_positive(_ptr, _val) do { \
				2945	typeof(_ptr) ptr = (_ptr); \
				2946	typeof(_val) val = (_val); \
				2947	typeof(ptr) res, var = READ_ONCE(ptr); \
				2948	\
				2949	res = var + val; \
				2950	\
				2951	if (val < 0 && res > var) \
				2952	res = 0; \
				2953	\
				2954	WRITE_ONCE(*ptr, res); \
				2955	} while (0)
				2956
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	2957	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	7c3edd2	2016-07-13 10:56:25 +0200	[diff] [blame]	2958	/**
				2959	* update_tg_load_avg - update the tg's load avg
				2960	* @cfs_rq: the cfs_rq whose avg changed
				2961	* @force: update regardless of how small the difference
				2962	*
				2963	* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load_avg.
				2964	* However, because tg->load_avg is a global value there are performance
				2965	* considerations.
				2966	*
				2967	* In order to avoid having to look at the other cfs_rq's, we use a
				2968	* differential update where we store the last value we propagated. This in
				2969	* turn allows skipping updates if the differential is 'small'.
				2970	*
				2971	* Updating tg's load_avg is necessary before update_cfs_share() (which is
				2972	* done) and effective_load() (which is not done because it is too costly).
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2973	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2974	static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2975	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2976	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2977
Waiman Long	aa0b7ae	2015-12-02 13:41:50 -0500	[diff] [blame]	2978	/*
				2979	* No need to update load_avg for root_task_group as it is not used.
				2980	*/
				2981	if (cfs_rq->tg == &root_task_group)
				2982	return;
				2983
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2984	if (force \|\| abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
				2985	atomic_long_add(delta, &cfs_rq->tg->load_avg);
				2986	cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2987	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	2988	}
Dietmar Eggemann	f5f9739	2014-02-26 11:19:33 +0000	[diff] [blame]	2989
Byungchul Park	ad936d8	2015-10-24 01:16:19 +0900	[diff] [blame]	2990	/*
				2991	* Called within set_task_rq() right before setting a task's cpu. The
				2992	* caller only guarantees p->pi_lock is held; no other assumptions,
				2993	* including the state of rq->lock, should be made.
				2994	*/
				2995	void set_task_rq_fair(struct sched_entity *se,
				2996	struct cfs_rq prev, struct cfs_rq next)
				2997	{
				2998	if (!sched_feat(ATTACH_AGE_LOAD))
				2999	return;
				3000
				3001	/*
				3002	* We are supposed to update the task to "current" time, then its up to
				3003	* date and ready to go to new CPU/cfs_rq. But we have difficulty in
				3004	* getting what current time is, so simply throw away the out-of-date
				3005	* time. This will result in the wakee task is less decayed, but giving
				3006	* the wakee more load sounds not bad.
				3007	*/
				3008	if (se->avg.last_update_time && prev) {
				3009	u64 p_last_update_time;
				3010	u64 n_last_update_time;
				3011
				3012	#ifndef CONFIG_64BIT
				3013	u64 p_last_update_time_copy;
				3014	u64 n_last_update_time_copy;
				3015
				3016	do {
				3017	p_last_update_time_copy = prev->load_last_update_time_copy;
				3018	n_last_update_time_copy = next->load_last_update_time_copy;
				3019
				3020	smp_rmb();
				3021
				3022	p_last_update_time = prev->avg.last_update_time;
				3023	n_last_update_time = next->avg.last_update_time;
				3024
				3025	} while (p_last_update_time != p_last_update_time_copy \|\|
				3026	n_last_update_time != n_last_update_time_copy);
				3027	#else
				3028	p_last_update_time = prev->avg.last_update_time;
				3029	n_last_update_time = next->avg.last_update_time;
				3030	#endif
				3031	__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
				3032	&se->avg, 0, 0, NULL);
				3033	se->avg.last_update_time = n_last_update_time;
				3034	}
				3035	}
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3036
				3037	/* Take into account change of utilization of a child task group */
				3038	static inline void
				3039	update_tg_cfs_util(struct cfs_rq cfs_rq, struct sched_entity se)
				3040	{
				3041	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
				3042	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
				3043
				3044	/* Nothing to update */
				3045	if (!delta)
				3046	return;
				3047
				3048	/* Set new sched_entity's utilization */
				3049	se->avg.util_avg = gcfs_rq->avg.util_avg;
				3050	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
				3051
				3052	/* Update parent cfs_rq utilization */
				3053	add_positive(&cfs_rq->avg.util_avg, delta);
				3054	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
				3055	}
				3056
				3057	/* Take into account change of load of a child task group */
				3058	static inline void
				3059	update_tg_cfs_load(struct cfs_rq cfs_rq, struct sched_entity se)
				3060	{
				3061	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
				3062	long delta, load = gcfs_rq->avg.load_avg;
				3063
				3064	/*
				3065	* If the load of group cfs_rq is null, the load of the
				3066	* sched_entity will also be null so we can skip the formula
				3067	*/
				3068	if (load) {
				3069	long tg_load;
				3070
				3071	/* Get tg's load and ensure tg_load > 0 */
				3072	tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
				3073
				3074	/* Ensure tg_load >= load and updated with current load*/
				3075	tg_load -= gcfs_rq->tg_load_avg_contrib;
				3076	tg_load += load;
				3077
				3078	/*
				3079	* We need to compute a correction term in the case that the
				3080	* task group is consuming more CPU than a task of equal
				3081	* weight. A task with a weight equals to tg->shares will have
				3082	* a load less or equal to scale_load_down(tg->shares).
				3083	* Similarly, the sched_entities that represent the task group
				3084	* at parent level, can't have a load higher than
				3085	* scale_load_down(tg->shares). And the Sum of sched_entities'
				3086	* load must be <= scale_load_down(tg->shares).
				3087	*/
				3088	if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
				3089	/* scale gcfs_rq's load into tg's shares*/
				3090	load *= scale_load_down(gcfs_rq->tg->shares);
				3091	load /= tg_load;
				3092	}
				3093	}
				3094
				3095	delta = load - se->avg.load_avg;
				3096
				3097	/* Nothing to update */
				3098	if (!delta)
				3099	return;
				3100
				3101	/* Set new sched_entity's load */
				3102	se->avg.load_avg = load;
				3103	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
				3104
				3105	/* Update parent cfs_rq load */
				3106	add_positive(&cfs_rq->avg.load_avg, delta);
				3107	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
				3108
				3109	/*
				3110	* If the sched_entity is already enqueued, we also have to update the
				3111	* runnable load avg.
				3112	*/
				3113	if (se->on_rq) {
				3114	/* Update parent cfs_rq runnable_load_avg */
				3115	add_positive(&cfs_rq->runnable_load_avg, delta);
				3116	cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
				3117	}
				3118	}
				3119
				3120	static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
				3121	{
				3122	cfs_rq->propagate_avg = 1;
				3123	}
				3124
				3125	static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
				3126	{
				3127	struct cfs_rq *cfs_rq = group_cfs_rq(se);
				3128
				3129	if (!cfs_rq->propagate_avg)
				3130	return 0;
				3131
				3132	cfs_rq->propagate_avg = 0;
				3133	return 1;
				3134	}
				3135
				3136	/* Update task and its cfs_rq load average */
				3137	static inline int propagate_entity_load_avg(struct sched_entity *se)
				3138	{
				3139	struct cfs_rq *cfs_rq;
				3140
				3141	if (entity_is_task(se))
				3142	return 0;
				3143
				3144	if (!test_and_clear_tg_cfs_propagate(se))
				3145	return 0;
				3146
				3147	cfs_rq = cfs_rq_of(se);
				3148
				3149	set_tg_cfs_propagate(cfs_rq);
				3150
				3151	update_tg_cfs_util(cfs_rq, se);
				3152	update_tg_cfs_load(cfs_rq, se);
				3153
				3154	return 1;
				3155	}
				3156
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	3157	#else /* CONFIG_FAIR_GROUP_SCHED */
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3158
/* Without CONFIG_FAIR_GROUP_SCHED the group helpers below do nothing. */

static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}

/* Nothing to propagate: always reports "no change" (0). */
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
	return 0;
}

static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
{
}
				3167
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	3168	#endif /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	3169
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3170	static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3171	{
Rafael J. Wysocki	58919e8	2016-08-16 22:14:55 +0200	[diff] [blame]	3172	if (&this_rq()->cfs == cfs_rq) {
Steve Muckle	21e96f8	2016-03-21 17:21:07 -0700	[diff] [blame]	3173	/*
				3174	* There are a few boundary cases this might miss but it should
				3175	* get called often enough that that should (hopefully) not be
				3176	* a real problem -- added to that it only calls on the local
				3177	* CPU, so if we enqueue remotely we'll miss an update, but
				3178	* the next tick/schedule should update.
				3179	*
				3180	* It will not get called when we go idle, because the idle
				3181	* thread is a different class (!fair), nor will the utilization
				3182	* number include things like RT tasks.
				3183	*
				3184	* As is, the util number is not freq-invariant (we'd have to
				3185	* implement arch_scale_freq_capacity() for that).
				3186	*
				3187	* See cpu_util().
				3188	*/
Rafael J. Wysocki	12bde33	2016-08-10 03:11:17 +0200	[diff] [blame]	3189	cpufreq_update_util(rq_of(cfs_rq), 0);
Steve Muckle	21e96f8	2016-03-21 17:21:07 -0700	[diff] [blame]	3190	}
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3191	}
				3192
/*
 * sub_positive - unsigned subtract, clamped at zero on underflow.
 * @_ptr: pointer to the counter to update
 * @_val: amount to subtract; converted to the counter's type
 *
 * The difference is computed in a local and stored with a single
 * WRITE_ONCE(), so the intermediate value never hits memory. This allows
 * lockless observations without ever seeing a negative (wrapped) value.
 *
 * Both arguments are evaluated exactly once.
 */
#define sub_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(*ptr) val = (_val);				\
	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
	res = var - val;					\
	/* A result larger than the start value means we wrapped. */ \
	if (res > var)						\
		res = 0;					\
	WRITE_ONCE(*ptr, res);					\
} while (0)
				3209
Peter Zijlstra	3d30544f	2016-06-21 14:27:50 +0200	[diff] [blame]	3210	/**
				3211	* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
				3212	* @now: current time, as per cfs_rq_clock_task()
				3213	* @cfs_rq: cfs_rq to update
				3214	* @update_freq: should we call cfs_rq_util_change() or will the call do so
				3215	*
				3216	* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
				3217	* avg. The immediate corollary is that all (fair) tasks must be attached, see
				3218	* post_init_entity_util_avg().
				3219	*
				3220	* cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
				3221	*
Peter Zijlstra	7c3edd2	2016-07-13 10:56:25 +0200	[diff] [blame]	3222	* Returns true if the load decayed or we removed load.
				3223	*
				3224	* Since both these conditions indicate a changed cfs_rq->avg.load we should
				3225	* call update_tg_load_avg() when this function returns true.
Peter Zijlstra	3d30544f	2016-06-21 14:27:50 +0200	[diff] [blame]	3226	*/
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3227	static inline int
				3228	update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
				3229	{
				3230	struct sched_avg *sa = &cfs_rq->avg;
				3231	int decayed, removed_load = 0, removed_util = 0;
				3232
				3233	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
				3234	s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
Peter Zijlstra	8974189	2016-06-16 10:50:40 +0200	[diff] [blame]	3235	sub_positive(&sa->load_avg, r);
				3236	sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3237	removed_load = 1;
Vincent Guittot	4e51607	2016-11-08 10:53:46 +0100	[diff] [blame]	3238	set_tg_cfs_propagate(cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3239	}
				3240
				3241	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
				3242	long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
Peter Zijlstra	8974189	2016-06-16 10:50:40 +0200	[diff] [blame]	3243	sub_positive(&sa->util_avg, r);
				3244	sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3245	removed_util = 1;
Vincent Guittot	4e51607	2016-11-08 10:53:46 +0100	[diff] [blame]	3246	set_tg_cfs_propagate(cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3247	}
				3248
				3249	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
				3250	scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
				3251
				3252	#ifndef CONFIG_64BIT
				3253	smp_wmb();
				3254	cfs_rq->load_last_update_time_copy = sa->last_update_time;
				3255	#endif
				3256
				3257	if (update_freq && (decayed \|\| removed_util))
				3258	cfs_rq_util_change(cfs_rq);
Steve Muckle	21e96f8	2016-03-21 17:21:07 -0700	[diff] [blame]	3259
Steve Muckle	41e0d37	2016-03-21 17:21:08 -0700	[diff] [blame]	3260	return decayed \|\| removed_load;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3261	}
				3262
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3263	/*
				3264	* Optional action to be done while updating the load average
				3265	*/
				3266	#define UPDATE_TG 0x1
				3267	#define SKIP_AGE_LOAD 0x2
				3268
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3269	/* Update task and its cfs_rq load average */
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3270	static inline void update_load_avg(struct sched_entity *se, int flags)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3271	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	3272	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3273	u64 now = cfs_rq_clock_task(cfs_rq);
Rafael J. Wysocki	34e2c55	2016-02-15 20:20:42 +0100	[diff] [blame]	3274	struct rq *rq = rq_of(cfs_rq);
				3275	int cpu = cpu_of(rq);
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3276	int decayed;
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	3277
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3278	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3279	* Track task load average for carrying it to new CPU after migrated, and
				3280	* track group sched_entity load average for task_h_load calc in migration
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3281	*/
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3282	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
				3283	__update_load_avg(now, cpu, &se->avg,
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3284	se->on_rq * scale_load_down(se->load.weight),
				3285	cfs_rq->curr == se, NULL);
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3286	}
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3287
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3288	decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
				3289	decayed \|= propagate_entity_load_avg(se);
				3290
				3291	if (decayed && (flags & UPDATE_TG))
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3292	update_tg_load_avg(cfs_rq, 0);
				3293	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	3294
Peter Zijlstra	3d30544f	2016-06-21 14:27:50 +0200	[diff] [blame]	3295	/**
				3296	* attach_entity_load_avg - attach this entity to its cfs_rq load avg
				3297	* @cfs_rq: cfs_rq to attach to
				3298	* @se: sched_entity to attach
				3299	*
				3300	* Must call update_cfs_rq_load_avg() before this, since we rely on
				3301	* cfs_rq->avg.last_update_time being current.
				3302	*/
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3303	static void attach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3304	{
				3305	se->avg.last_update_time = cfs_rq->avg.last_update_time;
				3306	cfs_rq->avg.load_avg += se->avg.load_avg;
				3307	cfs_rq->avg.load_sum += se->avg.load_sum;
				3308	cfs_rq->avg.util_avg += se->avg.util_avg;
				3309	cfs_rq->avg.util_sum += se->avg.util_sum;
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3310	set_tg_cfs_propagate(cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3311
				3312	cfs_rq_util_change(cfs_rq);
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3313	}
				3314
Peter Zijlstra	3d30544f	2016-06-21 14:27:50 +0200	[diff] [blame]	3315	/**
				3316	* detach_entity_load_avg - detach this entity from its cfs_rq load avg
				3317	* @cfs_rq: cfs_rq to detach from
				3318	* @se: sched_entity to detach
				3319	*
				3320	* Must call update_cfs_rq_load_avg() before this, since we rely on
				3321	* cfs_rq->avg.last_update_time being current.
				3322	*/
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3323	static void detach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3324	{
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3325
Peter Zijlstra	8974189	2016-06-16 10:50:40 +0200	[diff] [blame]	3326	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
				3327	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
				3328	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
				3329	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	3330	set_tg_cfs_propagate(cfs_rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	3331
				3332	cfs_rq_util_change(cfs_rq);
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3333	}
				3334
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3335	/* Add the load generated by se into cfs_rq's load average */
				3336	static inline void
				3337	enqueue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3338	{
				3339	struct sched_avg *sa = &se->avg;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3340
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3341	cfs_rq->runnable_load_avg += sa->load_avg;
				3342	cfs_rq->runnable_load_sum += sa->load_sum;
				3343
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3344	if (!sa->last_update_time) {
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3345	attach_entity_load_avg(cfs_rq, se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3346	update_tg_load_avg(cfs_rq, 0);
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3347	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3348	}
				3349
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3350	/* Remove the runnable load generated by se from cfs_rq's runnable load average */
				3351	static inline void
				3352	dequeue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				3353	{
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3354	cfs_rq->runnable_load_avg =
				3355	max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
				3356	cfs_rq->runnable_load_sum =
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	3357	max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3358	}
				3359
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	3360	#ifndef CONFIG_64BIT
				3361	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
				3362	{
				3363	u64 last_update_time_copy;
				3364	u64 last_update_time;
				3365
				3366	do {
				3367	last_update_time_copy = cfs_rq->load_last_update_time_copy;
				3368	smp_rmb();
				3369	last_update_time = cfs_rq->avg.last_update_time;
				3370	} while (last_update_time != last_update_time_copy);
				3371
				3372	return last_update_time;
				3373	}
				3374	#else
				3375	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
				3376	{
				3377	return cfs_rq->avg.last_update_time;
				3378	}
				3379	#endif
				3380
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3381	/*
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	3382	* Synchronize entity load avg of dequeued entity without locking
				3383	* the previous rq.
				3384	*/
				3385	void sync_entity_load_avg(struct sched_entity *se)
				3386	{
				3387	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3388	u64 last_update_time;
				3389
				3390	last_update_time = cfs_rq_last_update_time(cfs_rq);
				3391	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
				3392	}
				3393
				3394	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3395	* Task first catches up with cfs_rq, and then subtract
				3396	* itself from the cfs_rq (task must be off the queue now).
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3397	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3398	void remove_entity_load_avg(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3399	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3400	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3401
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	3402	/*
Peter Zijlstra	7dc603c	2016-06-16 13:29:28 +0200	[diff] [blame]	3403	* tasks cannot exit without having gone through wake_up_new_task() ->
				3404	* post_init_entity_util_avg() which will have added things to the
				3405	* cfs_rq, so we can remove unconditionally.
				3406	*
				3407	* Similarly for groups, they will have passed through
				3408	* post_init_entity_util_avg() before unregister_sched_fair_group()
				3409	* calls this.
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	3410	*/
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3411
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	3412	sync_entity_load_avg(se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3413	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
				3414	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	3415	}
Vincent Guittot	642dbc3	2013-04-18 18:34:26 +0200	[diff] [blame]	3416
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	3417	static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
				3418	{
				3419	return cfs_rq->runnable_load_avg;
				3420	}
				3421
				3422	static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
				3423	{
				3424	return cfs_rq->avg.load_avg;
				3425	}
				3426
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	3427	static int idle_balance(struct rq *this_rq);
				3428
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	3429	#else /* CONFIG_SMP */
				3430
Peter Zijlstra	0101147	2016-06-17 11:20:46 +0200	[diff] [blame]	3431	static inline int
				3432	update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
				3433	{
				3434	return 0;
				3435	}
				3436
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3437	#define UPDATE_TG 0x0
				3438	#define SKIP_AGE_LOAD 0x0
				3439
				3440	static inline void update_load_avg(struct sched_entity *se, int not_used1)
Rafael J. Wysocki	536bd00	2016-05-06 14:58:43 +0200	[diff] [blame]	3441	{
Rafael J. Wysocki	12bde33	2016-08-10 03:11:17 +0200	[diff] [blame]	3442	cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
Rafael J. Wysocki	536bd00	2016-05-06 14:58:43 +0200	[diff] [blame]	3443	}
				3444
/* !CONFIG_SMP: entity load tracking hooks are empty. */
static inline void
enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void remove_entity_load_avg(struct sched_entity *se) {}

static inline void
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
static inline void
detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
				3455
/* !CONFIG_SMP: there is nothing to balance; always returns 0. */
static inline int idle_balance(struct rq *rq)
{
	return 0;
}
				3460
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	3461	#endif /* CONFIG_SMP */
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3462
/*
 * Debug accounting (CONFIG_SCHED_DEBUG only): bump nr_spread_over when the
 * distance between @se's vruntime and @cfs_rq's min_vruntime, in either
 * direction, exceeds three times sysctl_sched_latency.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq->nr_spread_over);
#endif
}
				3475
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3476	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3477	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				3478	{
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	3479	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	3480
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	3481	/*
				3482	* The 'current' period is already promised to the current tasks,
				3483	* however the extra weight of the new task will slow them down a
				3484	* little, place the new task so that it fits in the slot that
				3485	* stays open at the end.
				3486	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	3487	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	3488	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3489
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3490	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	3491	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3492	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	3493
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3494	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3495	* Halve their sleep time's effect, to allow
				3496	* for a gentler effect of sleepers:
				3497	*/
				3498	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				3499	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	3500
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3501	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3502	}
				3503
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	3504	/* ensure we never gain time by being placed backwards. */
Viresh Kumar	16c8f1c	2012-11-08 13:33:46 +0530	[diff] [blame]	3505	se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3506	}
				3507
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3508	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				3509
/*
 * Warn (once) when a scheduler stat tracepoint is active while schedstats
 * are disabled. Compiled out without CONFIG_SCHEDSTATS.
 */
static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
	if (schedstat_enabled())
		return;

	/* Force schedstat enabled if a dependent tracepoint is active */
	if (trace_sched_stat_wait_enabled()    ||
			trace_sched_stat_sleep_enabled()   ||
			trace_sched_stat_iowait_enabled()  ||
			trace_sched_stat_blocked_enabled() ||
			trace_sched_stat_runtime_enabled())  {
		printk_deferred_once("Scheduler tracepoints stat_sleep, stat_iowait, "
			     "stat_blocked and stat_runtime require the "
			     "kernel parameter schedstats=enabled or "
			     "kernel.sched_schedstats=1\n");
	}
#endif
}
				3529
Peter Zijlstra	b5179ac	2016-05-11 16:10:34 +0200	[diff] [blame]	3530
				3531	/*
				3532	* MIGRATION
				3533	*
				3534	* dequeue
				3535	* update_curr()
				3536	* update_min_vruntime()
				3537	* vruntime -= min_vruntime
				3538	*
				3539	* enqueue
				3540	* update_curr()
				3541	* update_min_vruntime()
				3542	* vruntime += min_vruntime
				3543	*
				3544	* this way the vruntime transition between RQs is done when both
				3545	* min_vruntime are up-to-date.
				3546	*
				3547	* WAKEUP (remote)
				3548	*
Peter Zijlstra	59efa0b	2016-05-10 18:24:37 +0200	[diff] [blame]	3549	* ->migrate_task_rq_fair() (p->state == TASK_WAKING)
Peter Zijlstra	b5179ac	2016-05-11 16:10:34 +0200	[diff] [blame]	3550	* vruntime -= min_vruntime
				3551	*
				3552	* enqueue
				3553	* update_curr()
				3554	* update_min_vruntime()
				3555	* vruntime += min_vruntime
				3556	*
				3557	* this way we don't have the most up-to-date min_vruntime on the originating
				3558	* CPU and an up-to-date min_vruntime on the destination CPU.
				3559	*/
				3560
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3561	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3562	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3563	{
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3564	bool renorm = !(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_MIGRATED);
				3565	bool curr = cfs_rq->curr == se;
Peter Zijlstra	3a47d51	2016-03-09 13:04:03 +0100	[diff] [blame]	3566
Ingo Molnar	53d3bc7	2016-05-11 08:25:53 +0200	[diff] [blame]	3567	/*
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3568	* If we're the current task, we must renormalise before calling
				3569	* update_curr().
Ingo Molnar	53d3bc7	2016-05-11 08:25:53 +0200	[diff] [blame]	3570	*/
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3571	if (renorm && curr)
				3572	se->vruntime += cfs_rq->min_vruntime;
				3573
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	3574	update_curr(cfs_rq);
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3575
				3576	/*
				3577	* Otherwise, renormalise after, such that we're placed at the current
				3578	* moment in time, instead of some random moment in the past. Being
				3579	* placed in the past could significantly boost this task to the
				3580	* fairness detriment of existing tasks.
				3581	*/
				3582	if (renorm && !curr)
				3583	se->vruntime += cfs_rq->min_vruntime;
				3584
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3585	update_load_avg(se, UPDATE_TG);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3586	enqueue_entity_load_avg(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3587	account_entity_enqueue(cfs_rq, se);
				3588	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3589
Josh Poimboeuf	1a3d027	2016-06-17 12:43:23 -0500	[diff] [blame]	3590	if (flags & ENQUEUE_WAKEUP)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3591	place_entity(cfs_rq, se, 0);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3592
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	3593	check_schedstat_required();
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3594	update_stats_enqueue(cfs_rq, se, flags);
				3595	check_spread(cfs_rq, se);
Peter Zijlstra	2f95035	2016-05-11 19:27:56 +0200	[diff] [blame]	3596	if (!curr)
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3597	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3598	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	3599
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3600	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	3601	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3602	check_enqueue_throttle(cfs_rq);
				3603	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3604	}
				3605
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3606	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3607	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3608	for_each_sched_entity(se) {
				3609	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3610	if (cfs_rq->last != se)
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3611	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3612
				3613	cfs_rq->last = NULL;
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3614	}
				3615	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3616
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3617	static void __clear_buddies_next(struct sched_entity *se)
				3618	{
				3619	for_each_sched_entity(se) {
				3620	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3621	if (cfs_rq->next != se)
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3622	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3623
				3624	cfs_rq->next = NULL;
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3625	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3626	}
				3627
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3628	static void __clear_buddies_skip(struct sched_entity *se)
				3629	{
				3630	for_each_sched_entity(se) {
				3631	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3632	if (cfs_rq->skip != se)
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3633	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3634
				3635	cfs_rq->skip = NULL;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3636	}
				3637	}
				3638
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	3639	static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
				3640	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3641	if (cfs_rq->last == se)
				3642	__clear_buddies_last(se);
				3643
				3644	if (cfs_rq->next == se)
				3645	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3646
				3647	if (cfs_rq->skip == se)
				3648	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	3649	}
				3650
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	3651	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3652
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3653	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3654	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3655	{
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	3656	/*
				3657	* Update run-time statistics of the 'current'.
				3658	*/
				3659	update_curr(cfs_rq);
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3660	update_load_avg(se, UPDATE_TG);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3661	dequeue_entity_load_avg(cfs_rq, se);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	3662
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3663	update_stats_dequeue(cfs_rq, se, flags);
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	3664
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3665	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	3666
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3667	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3668	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3669	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3670	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3671
				3672	/*
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	3673	* Normalize after update_curr(); which will also have moved
				3674	* min_vruntime if @se is the one holding it back. But before doing
				3675	* update_min_vruntime() again, which will discount @se's position and
				3676	* can move min_vruntime forward still more.
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3677	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3678	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3679	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	3680
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3681	/* return excess runtime on last dequeue */
				3682	return_cfs_rq_runtime(cfs_rq);
				3683
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3684	update_cfs_shares(cfs_rq);
Peter Zijlstra	b60205c	2016-09-20 21:58:12 +0200	[diff] [blame]	3685
				3686	/*
				3687	* Now advance min_vruntime if @se was the entity holding it back,
				3688	* except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
				3689	* put back on, and if we advance min_vruntime, we'll be placed back
				3690	* further than we started -- ie. we'll be penalized.
				3691	*/
				3692	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
				3693	update_min_vruntime(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3694	}
				3695
				3696	/*
				3697	* Preempt the current task with a newly woken task if needed:
				3698	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	3699	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	3700	check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3701	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3702	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3703	struct sched_entity *se;
				3704	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3705
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	3706	ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3707	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	3708	if (delta_exec > ideal_runtime) {
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3709	resched_curr(rq_of(cfs_rq));
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	3710	/*
				3711	* The current task ran long enough, ensure it doesn't get
				3712	* re-elected due to buddy favours.
				3713	*/
				3714	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3715	return;
				3716	}
				3717
				3718	/*
				3719	* Ensure that a task that missed wakeup preemption by a
				3720	* narrow margin doesn't have to wait for a full slice.
				3721	* This also mitigates buddy induced latencies under load.
				3722	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3723	if (delta_exec < sysctl_sched_min_granularity)
				3724	return;
				3725
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3726	se = __pick_first_entity(cfs_rq);
				3727	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3728
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3729	if (delta < 0)
				3730	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	3731
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3732	if (delta > ideal_runtime)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3733	resched_curr(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3734	}
				3735
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3736	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	3737	set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3738	{
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3739	/* 'current' is not kept within the tree. */
				3740	if (se->on_rq) {
				3741	/*
				3742	* Any task has to be enqueued before it get to execute on
				3743	* a CPU. So account for the time it spent waiting on the
				3744	* runqueue.
				3745	*/
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3746	update_stats_wait_end(cfs_rq, se);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3747	__dequeue_entity(cfs_rq, se);
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3748	update_load_avg(se, UPDATE_TG);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3749	}
				3750
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	3751	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	3752	cfs_rq->curr = se;
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3753
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	3754	/*
				3755	* Track our maximum slice length, if the CPU's load is at
				3756	* least twice that of our own weight (i.e. dont track it
				3757	* when there are only lesser-weight tasks around):
				3758	*/
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	3759	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3760	schedstat_set(se->statistics.slice_max,
				3761	max((u64)schedstat_val(se->statistics.slice_max),
				3762	se->sum_exec_runtime - se->prev_sum_exec_runtime));
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	3763	}
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3764
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	3765	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3766	}
				3767
Peter Zijlstra	3f3a490	2008-10-24 11:06:16 +0200	[diff] [blame]	3768	static int
				3769	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
				3770
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3771	/*
				3772	* Pick the next process, keeping these things in mind, in this order:
				3773	* 1) keep things fair between processes/task groups
				3774	* 2) pick the "next" process, since someone really wants that to run
				3775	* 3) pick the "last" process, for cache locality
				3776	* 4) do not run the "skip" process, if something else is available
				3777	*/
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3778	static struct sched_entity *
				3779	pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3780	{
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3781	struct sched_entity *left = __pick_first_entity(cfs_rq);
				3782	struct sched_entity *se;
				3783
				3784	/*
				3785	* If curr is set we have to see if its left of the leftmost entity
				3786	* still in the tree, provided there was anything in the tree at all.
				3787	*/
				3788	if (!left || (curr && entity_before(curr, left)))
				3789	left = curr;
				3790
				3791	se = left; /* ideally we run the leftmost entity */
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	3792
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3793	/*
				3794	* Avoid running the skip buddy, if running something else can
				3795	* be done without getting too unfair.
				3796	*/
				3797	if (cfs_rq->skip == se) {
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3798	struct sched_entity *second;
				3799
				3800	if (se == curr) {
				3801	second = __pick_first_entity(cfs_rq);
				3802	} else {
				3803	second = __pick_next_entity(se);
				3804	if (!second || (curr && entity_before(curr, second)))
				3805	second = curr;
				3806	}
				3807
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3808	if (second && wakeup_preempt_entity(second, left) < 1)
				3809	se = second;
				3810	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3811
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3812	/*
				3813	* Prefer last buddy, try to return the CPU to a preempted task.
				3814	*/
				3815	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				3816	se = cfs_rq->last;
				3817
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3818	/*
				3819	* Someone really wants this to run. If it's not unfair, run it.
				3820	*/
				3821	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				3822	se = cfs_rq->next;
				3823
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3824	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	3825
				3826	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3827	}
				3828
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3829	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3830
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	3831	static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3832	{
				3833	/*
				3834	* If still on the runqueue then deactivate_task()
				3835	* was not called and update_curr() has to be done:
				3836	*/
				3837	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	3838	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3839
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3840	/* throttle cfs_rqs exceeding runtime */
				3841	check_cfs_rq_runtime(cfs_rq);
				3842
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3843	check_spread(cfs_rq, prev);
Mel Gorman	cb25176	2016-02-05 09:08:36 +0000	[diff] [blame]	3844
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3845	if (prev->on_rq) {
Josh Poimboeuf	4fa8d29	2016-06-17 12:43:26 -0500	[diff] [blame]	3846	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3847	/* Put 'current' back into the tree. */
				3848	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3849	/* in !on_rq case, update occurred at dequeue */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3850	update_load_avg(prev, 0);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3851	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	3852	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3853	}
				3854
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3855	static void
				3856	entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3857	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3858	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3859	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3860	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3861	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3862
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	3863	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3864	* Ensure that runnable average is periodically updated.
				3865	*/
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	3866	update_load_avg(curr, UPDATE_TG);
Peter Zijlstra	bf0bd94	2013-07-26 23:48:42 +0200	[diff] [blame]	3867	update_cfs_shares(cfs_rq);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3868
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3869	#ifdef CONFIG_SCHED_HRTICK
				3870	/*
				3871	* queued ticks are scheduled to match the slice, so don't bother
				3872	* validating it and just reschedule.
				3873	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	3874	if (queued) {
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3875	resched_curr(rq_of(cfs_rq));
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	3876	return;
				3877	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3878	/*
				3879	* don't let the period tick interfere with the hrtick preemption
				3880	*/
				3881	if (!sched_feat(DOUBLE_TICK) &&
				3882	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				3883	return;
				3884	#endif
				3885
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	3886	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	3887	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3888	}
				3889
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	3890
				3891	/**************************************************
				3892	* CFS bandwidth control machinery
				3893	*/
				3894
				3895	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3896
				3897	#ifdef HAVE_JUMP_LABEL
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	3898	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3899
				3900	static inline bool cfs_bandwidth_used(void)
				3901	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	3902	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3903	}
				3904
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3905	void cfs_bandwidth_usage_inc(void)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3906	{
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3907	static_key_slow_inc(&__cfs_bandwidth_used);
				3908	}
				3909
				3910	void cfs_bandwidth_usage_dec(void)
				3911	{
				3912	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3913	}
				3914	#else /* HAVE_JUMP_LABEL */
				3915	static bool cfs_bandwidth_used(void)
				3916	{
				3917	return true;
				3918	}
				3919
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3920	void cfs_bandwidth_usage_inc(void) {}
				3921	void cfs_bandwidth_usage_dec(void) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3922	#endif /* HAVE_JUMP_LABEL */
				3923
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	3924	/*
				3925	* default period for cfs group bandwidth.
				3926	* default: 0.1s, units: nanoseconds
				3927	*/
				3928	static inline u64 default_cfs_period(void)
				3929	{
				3930	return 100000000ULL;
				3931	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3932
				3933	static inline u64 sched_cfs_bandwidth_slice(void)
				3934	{
				3935	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				3936	}
				3937
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3938	/*
				3939	* Replenish runtime according to assigned quota and update expiration time.
				3940	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				3941	* additional synchronization around rq->lock.
				3942	*
				3943	* requires cfs_b->lock
				3944	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3945	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3946	{
				3947	u64 now;
				3948
				3949	if (cfs_b->quota == RUNTIME_INF)
				3950	return;
				3951
				3952	now = sched_clock_cpu(smp_processor_id());
				3953	cfs_b->runtime = cfs_b->quota;
				3954	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				3955	}
				3956
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3957	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
				3958	{
				3959	return &tg->cfs_bandwidth;
				3960	}
				3961
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3962	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				3963	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				3964	{
				3965	if (unlikely(cfs_rq->throttle_count))
Xunlei Pang	1a99ae3	2016-05-10 21:03:18 +0800	[diff] [blame]	3966	return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3967
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3968	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3969	}
				3970
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3971	/* returns 0 on failure to allocate runtime */
				3972	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3973	{
				3974	struct task_group *tg = cfs_rq->tg;
				3975	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3976	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3977
				3978	/* note: this is a positive sum as runtime_remaining <= 0 */
				3979	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				3980
				3981	raw_spin_lock(&cfs_b->lock);
				3982	if (cfs_b->quota == RUNTIME_INF)
				3983	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3984	else {
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	3985	start_cfs_bandwidth(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3986
				3987	if (cfs_b->runtime > 0) {
				3988	amount = min(cfs_b->runtime, min_amount);
				3989	cfs_b->runtime -= amount;
				3990	cfs_b->idle = 0;
				3991	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3992	}
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3993	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3994	raw_spin_unlock(&cfs_b->lock);
				3995
				3996	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3997	/*
				3998	* we may have advanced our local expiration to account for allowed
				3999	* spread between our sched_clock and the one on which runtime was
				4000	* issued.
				4001	*/
				4002	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				4003	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4004
				4005	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4006	}
				4007
				4008	/*
				4009	* Note: This depends on the synchronization provided by sched_clock and the
				4010	* fact that rq->clock snapshots this value.
				4011	*/
				4012	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4013	{
				4014	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4015
				4016	/* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4017	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4018	return;
				4019
				4020	if (cfs_rq->runtime_remaining < 0)
				4021	return;
				4022
				4023	/*
				4024	* If the local deadline has passed we have to consider the
				4025	* possibility that our sched_clock is 'fast' and the global deadline
				4026	* has not truly expired.
				4027	*
				4028	* Fortunately we can check determine whether this the case by checking
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4029	* whether the global deadline has advanced. It is valid to compare
				4030	* cfs_b->runtime_expires without any locks since we only care about
				4031	* exact equality, so a partial write will still work.
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4032	*/
				4033
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4034	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4035	/* extend local deadline, drift is bounded above by 2 ticks */
				4036	cfs_rq->runtime_expires += TICK_NSEC;
				4037	} else {
				4038	/* global deadline is ahead, expiration has passed */
				4039	cfs_rq->runtime_remaining = 0;
				4040	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4041	}
				4042
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	4043	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4044	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4045	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4046	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4047	expire_cfs_rq_runtime(cfs_rq);
				4048
				4049	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4050	return;
				4051
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4052	/*
				4053	* if we're unable to extend our runtime we resched so that the active
				4054	* hierarchy can be throttled
				4055	*/
				4056	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	4057	resched_curr(rq_of(cfs_rq));
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4058	}
				4059
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	4060	static __always_inline
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	4061	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4062	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4063	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	4064	return;
				4065
				4066	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				4067	}
				4068
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4069	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				4070	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4071	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4072	}
				4073
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4074	/* check whether cfs_rq, or any parent, is throttled */
				4075	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				4076	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4077	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4078	}
				4079
				4080	/*
				4081	* Ensure that neither of the group entities corresponding to src_cpu or
				4082	* dest_cpu are members of a throttled hierarchy when performing group
				4083	* load-balance operations.
				4084	*/
				4085	static inline int throttled_lb_pair(struct task_group *tg,
				4086	int src_cpu, int dest_cpu)
				4087	{
				4088	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
				4089
				4090	src_cfs_rq = tg->cfs_rq[src_cpu];
				4091	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				4092
				4093	return throttled_hierarchy(src_cfs_rq) ||
				4094	throttled_hierarchy(dest_cfs_rq);
				4095	}
				4096
				4097	/* updated child weight may affect parent so we have to do this bottom up */
				4098	static int tg_unthrottle_up(struct task_group *tg, void *data)
				4099	{
				4100	struct rq *rq = data;
				4101	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				4102
				4103	cfs_rq->throttle_count--;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4104	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4105	/* adjust cfs_rq_clock_task() */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4106	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4107	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4108	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4109
				4110	return 0;
				4111	}
				4112
				4113	static int tg_throttle_down(struct task_group *tg, void *data)
				4114	{
				4115	struct rq *rq = data;
				4116	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				4117
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4118	/* group is entering throttled state, stop time */
				4119	if (!cfs_rq->throttle_count)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4120	cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4121	cfs_rq->throttle_count++;
				4122
				4123	return 0;
				4124	}
				4125
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4126	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4127	{
				4128	struct rq *rq = rq_of(cfs_rq);
				4129	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4130	struct sched_entity *se;
				4131	long task_delta, dequeue = 1;
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4132	bool empty;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4133
				4134	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				4135
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4136	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4137	rcu_read_lock();
				4138	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				4139	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4140
				4141	task_delta = cfs_rq->h_nr_running;
				4142	for_each_sched_entity(se) {
				4143	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				4144	/* throttled entity or throttle-on-deactivate */
				4145	if (!se->on_rq)
				4146	break;
				4147
				4148	if (dequeue)
				4149	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				4150	qcfs_rq->h_nr_running -= task_delta;
				4151
				4152	if (qcfs_rq->load.weight)
				4153	dequeue = 0;
				4154	}
				4155
				4156	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4157	sub_nr_running(rq, task_delta);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4158
				4159	cfs_rq->throttled = 1;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4160	cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4161	raw_spin_lock(&cfs_b->lock);
Cong Wang	d49db34	2015-06-24 12:41:47 -0700	[diff] [blame]	4162	empty = list_empty(&cfs_b->throttled_cfs_rq);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4163
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4164	/*
				4165	* Add to the _head_ of the list, so that an already-started
				4166	* distribute_cfs_runtime will not see us
				4167	*/
				4168	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4169
				4170	/*
				4171	* If we're the first throttled task, make sure the bandwidth
				4172	* timer is running.
				4173	*/
				4174	if (empty)
				4175	start_cfs_bandwidth(cfs_b);
				4176
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4177	raw_spin_unlock(&cfs_b->lock);
				4178	}
				4179
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4180	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4181	{
				4182	struct rq *rq = rq_of(cfs_rq);
				4183	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4184	struct sched_entity *se;
				4185	int enqueue = 1;
				4186	long task_delta;
				4187
Michael Wang	22b958d	2013-06-04 14:23:39 +0800	[diff] [blame]	4188	se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4189
				4190	cfs_rq->throttled = 0;
Frederic Weisbecker	1a55af2	2013-04-12 01:51:01 +0200	[diff] [blame]	4191
				4192	update_rq_clock(rq);
				4193
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4194	raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4195	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4196	list_del_rcu(&cfs_rq->throttled_list);
				4197	raw_spin_unlock(&cfs_b->lock);
				4198
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4199	/* update hierarchical throttle state */
				4200	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				4201
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4202	if (!cfs_rq->load.weight)
				4203	return;
				4204
				4205	task_delta = cfs_rq->h_nr_running;
				4206	for_each_sched_entity(se) {
				4207	if (se->on_rq)
				4208	enqueue = 0;
				4209
				4210	cfs_rq = cfs_rq_of(se);
				4211	if (enqueue)
				4212	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				4213	cfs_rq->h_nr_running += task_delta;
				4214
				4215	if (cfs_rq_throttled(cfs_rq))
				4216	break;
				4217	}
				4218
				4219	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4220	add_nr_running(rq, task_delta);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4221
				4222	/* determine whether we need to wake up potentially idle cpu */
				4223	if (rq->curr == rq->idle && rq->cfs.nr_running)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	4224	resched_curr(rq);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4225	}
				4226
				4227	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				4228	u64 remaining, u64 expires)
				4229	{
				4230	struct cfs_rq *cfs_rq;
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4231	u64 runtime;
				4232	u64 starting_runtime = remaining;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4233
				4234	rcu_read_lock();
				4235	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				4236	throttled_list) {
				4237	struct rq *rq = rq_of(cfs_rq);
				4238
				4239	raw_spin_lock(&rq->lock);
				4240	if (!cfs_rq_throttled(cfs_rq))
				4241	goto next;
				4242
				4243	runtime = -cfs_rq->runtime_remaining + 1;
				4244	if (runtime > remaining)
				4245	runtime = remaining;
				4246	remaining -= runtime;
				4247
				4248	cfs_rq->runtime_remaining += runtime;
				4249	cfs_rq->runtime_expires = expires;
				4250
				4251	/* we check whether we're throttled above */
				4252	if (cfs_rq->runtime_remaining > 0)
				4253	unthrottle_cfs_rq(cfs_rq);
				4254
				4255	next:
				4256	raw_spin_unlock(&rq->lock);
				4257
				4258	if (!remaining)
				4259	break;
				4260	}
				4261	rcu_read_unlock();
				4262
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4263	return starting_runtime - remaining;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4264	}
				4265
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4266	/*
				4267	* Responsible for refilling a task_group's bandwidth and unthrottling its
				4268	* cfs_rqs as appropriate. If there has been no activity within the last
				4269	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				4270	* used to track this state.
				4271	*/
				4272	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				4273	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4274	u64 runtime, runtime_expires;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4275	int throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4276
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4277	/* no need to continue the timer with no bandwidth constraint */
				4278	if (cfs_b->quota == RUNTIME_INF)
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4279	goto out_deactivate;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4280
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4281	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	4282	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4283
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4284	/*
				4285	* idle depends on !throttled (for the case of a large deficit), and if
				4286	* we're going inactive then everything else can be deferred
				4287	*/
				4288	if (cfs_b->idle && !throttled)
				4289	goto out_deactivate;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	4290
				4291	__refill_cfs_bandwidth_runtime(cfs_b);
				4292
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4293	if (!throttled) {
				4294	/* mark as potentially idle for the upcoming period */
				4295	cfs_b->idle = 1;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4296	return 0;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4297	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4298
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	4299	/* account preceding periods in which throttling occurred */
				4300	cfs_b->nr_throttled += overrun;
				4301
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4302	runtime_expires = cfs_b->runtime_expires;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4303
				4304	/*
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4305	* This check is repeated as we are holding onto the new bandwidth while
				4306	* we unthrottle. This can potentially race with an unthrottled group
				4307	* trying to acquire new bandwidth from the global pool. This can result
				4308	* in us over-using our runtime if it is all used during this loop, but
				4309	* only by limited amounts in that extreme case.
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4310	*/
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4311	while (throttled && cfs_b->runtime > 0) {
				4312	runtime = cfs_b->runtime;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4313	raw_spin_unlock(&cfs_b->lock);
				4314	/* we can't nest cfs_b->lock while distributing bandwidth */
				4315	runtime = distribute_cfs_runtime(cfs_b, runtime,
				4316	runtime_expires);
				4317	raw_spin_lock(&cfs_b->lock);
				4318
				4319	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4320
				4321	cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4322	}
				4323
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	4324	/*
				4325	* While we are ensured activity in the period following an
				4326	* unthrottle, this also covers the case in which the new bandwidth is
				4327	* insufficient to cover the existing bandwidth deficit. (Forcing the
				4328	* timer to remain active while there are any throttled entities.)
				4329	*/
				4330	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4331
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4332	return 0;
				4333
				4334	out_deactivate:
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4335	return 1;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	4336	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4337
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4338	/* a cfs_rq won't donate quota below this amount */
				4339	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				4340	/* minimum remaining period time to redistribute slack quota */
				4341	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				4342	/* how long we wait to gather additional slack before distributing */
				4343	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				4344
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	4345	/*
				4346	* Are we near the end of the current quota period?
				4347	*
				4348	* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
Thomas Gleixner	4961b6e	2015-04-14 21:09:05 +0000	[diff] [blame]	4349	* hrtimer base being cleared by hrtimer_start. In the case of
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	4350	* migrate_hrtimers, base is never cleared, so we are fine.
				4351	*/
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4352	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				4353	{
				4354	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				4355	u64 remaining;
				4356
				4357	/* if the call-back is running a quota refresh is already occurring */
				4358	if (hrtimer_callback_running(refresh_timer))
				4359	return 1;
				4360
				4361	/* is a quota refresh about to occur? */
				4362	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				4363	if (remaining < min_expire)
				4364	return 1;
				4365
				4366	return 0;
				4367	}
				4368
				4369	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				4370	{
				4371	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				4372
				4373	/* if there's a quota refresh soon don't bother with slack */
				4374	if (runtime_refresh_within(cfs_b, min_left))
				4375	return;
				4376
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4377	hrtimer_start(&cfs_b->slack_timer,
				4378	ns_to_ktime(cfs_bandwidth_slack_period),
				4379	HRTIMER_MODE_REL);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4380	}
				4381
				4382	/* we know any runtime found here is valid as update_curr() precedes return */
				4383	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4384	{
				4385	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				4386	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				4387
				4388	if (slack_runtime <= 0)
				4389	return;
				4390
				4391	raw_spin_lock(&cfs_b->lock);
				4392	if (cfs_b->quota != RUNTIME_INF &&
				4393	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				4394	cfs_b->runtime += slack_runtime;
				4395
				4396	/* we are under rq->lock, defer unthrottling using a timer */
				4397	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				4398	!list_empty(&cfs_b->throttled_cfs_rq))
				4399	start_cfs_slack_bandwidth(cfs_b);
				4400	}
				4401	raw_spin_unlock(&cfs_b->lock);
				4402
				4403	/* even if it's not valid for return we don't want to try again */
				4404	cfs_rq->runtime_remaining -= slack_runtime;
				4405	}
				4406
				4407	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4408	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4409	if (!cfs_bandwidth_used())
				4410	return;
				4411
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	4412	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4413	return;
				4414
				4415	__return_cfs_rq_runtime(cfs_rq);
				4416	}
				4417
				4418	/*
				4419	* This is done with a timer (instead of inline with bandwidth return) since
				4420	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				4421	*/
				4422	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				4423	{
				4424	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				4425	u64 expires;
				4426
				4427	/* confirm we're still not at a refresh boundary */
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4428	raw_spin_lock(&cfs_b->lock);
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	4429	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
				4430	raw_spin_unlock(&cfs_b->lock);
				4431	return;
				4432	}
				4433
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4434	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4435	runtime = cfs_b->runtime;
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4436
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4437	expires = cfs_b->runtime_expires;
				4438	raw_spin_unlock(&cfs_b->lock);
				4439
				4440	if (!runtime)
				4441	return;
				4442
				4443	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				4444
				4445	raw_spin_lock(&cfs_b->lock);
				4446	if (expires == cfs_b->runtime_expires)
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	4447	cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	4448	raw_spin_unlock(&cfs_b->lock);
				4449	}
				4450
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4451	/*
				4452	* When a group wakes up we want to make sure that its quota is not already
				4453	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				4454	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				4455	*/
				4456	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				4457	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4458	if (!cfs_bandwidth_used())
				4459	return;
				4460
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4461	/* an active group must be handled by the update_curr()->put() path */
				4462	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				4463	return;
				4464
				4465	/* ensure the group is not already throttled */
				4466	if (cfs_rq_throttled(cfs_rq))
				4467	return;
				4468
				4469	/* update runtime allocation */
				4470	account_cfs_rq_runtime(cfs_rq, 0);
				4471	if (cfs_rq->runtime_remaining <= 0)
				4472	throttle_cfs_rq(cfs_rq);
				4473	}
				4474
Peter Zijlstra	55e16d3	2016-06-22 15:14:26 +0200	[diff] [blame]	4475	static void sync_throttle(struct task_group *tg, int cpu)
				4476	{
				4477	struct cfs_rq pcfs_rq, cfs_rq;
				4478
				4479	if (!cfs_bandwidth_used())
				4480	return;
				4481
				4482	if (!tg->parent)
				4483	return;
				4484
				4485	cfs_rq = tg->cfs_rq[cpu];
				4486	pcfs_rq = tg->parent->cfs_rq[cpu];
				4487
				4488	cfs_rq->throttle_count = pcfs_rq->throttle_count;
Xunlei Pang	b892212	2016-07-09 15:54:22 +0800	[diff] [blame]	4489	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
Peter Zijlstra	55e16d3	2016-06-22 15:14:26 +0200	[diff] [blame]	4490	}
				4491
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4492	/* conditionally throttle active cfs_rq's from put_prev_entity() */
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4493	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4494	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4495	if (!cfs_bandwidth_used())
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4496	return false;
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4497
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4498	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4499	return false;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4500
				4501	/*
				4502	* it's possible for a throttled entity to be forced into a running
				4503	* state (e.g. set_curr_task), in this case we're finished.
				4504	*/
				4505	if (cfs_rq_throttled(cfs_rq))
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4506	return true;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4507
				4508	throttle_cfs_rq(cfs_rq);
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4509	return true;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4510	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4511
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4512	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				4513	{
				4514	struct cfs_bandwidth *cfs_b =
				4515	container_of(timer, struct cfs_bandwidth, slack_timer);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4516
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4517	do_sched_cfs_slack_timer(cfs_b);
				4518
				4519	return HRTIMER_NORESTART;
				4520	}
				4521
				4522	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				4523	{
				4524	struct cfs_bandwidth *cfs_b =
				4525	container_of(timer, struct cfs_bandwidth, period_timer);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4526	int overrun;
				4527	int idle = 0;
				4528
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4529	raw_spin_lock(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4530	for (;;) {
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4531	overrun = hrtimer_forward_now(timer, cfs_b->period);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4532	if (!overrun)
				4533	break;
				4534
				4535	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				4536	}
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4537	if (idle)
				4538	cfs_b->period_active = 0;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4539	raw_spin_unlock(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4540
				4541	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				4542	}
				4543
				4544	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4545	{
				4546	raw_spin_lock_init(&cfs_b->lock);
				4547	cfs_b->runtime = 0;
				4548	cfs_b->quota = RUNTIME_INF;
				4549	cfs_b->period = ns_to_ktime(default_cfs_period());
				4550
				4551	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4552	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4553	cfs_b->period_timer.function = sched_cfs_period_timer;
				4554	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				4555	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				4556	}
				4557
				4558	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4559	{
				4560	cfs_rq->runtime_enabled = 0;
				4561	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				4562	}
				4563
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4564	void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4565	{
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4566	lockdep_assert_held(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4567
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4568	if (!cfs_b->period_active) {
				4569	cfs_b->period_active = 1;
				4570	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
				4571	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
				4572	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4573	}
				4574
				4575	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4576	{
Tetsuo Handa	7f1a169	2014-12-25 15:51:21 +0900	[diff] [blame]	4577	/* init_cfs_bandwidth() was not called */
				4578	if (!cfs_b->throttled_cfs_rq.next)
				4579	return;
				4580
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4581	hrtimer_cancel(&cfs_b->period_timer);
				4582	hrtimer_cancel(&cfs_b->slack_timer);
				4583	}
				4584
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4585	static void __maybe_unused update_runtime_enabled(struct rq *rq)
				4586	{
				4587	struct cfs_rq *cfs_rq;
				4588
				4589	for_each_leaf_cfs_rq(rq, cfs_rq) {
				4590	struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
				4591
				4592	raw_spin_lock(&cfs_b->lock);
				4593	cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
				4594	raw_spin_unlock(&cfs_b->lock);
				4595	}
				4596	}
				4597
Arnd Bergmann	38dc334	2013-01-25 14:14:22 +0000	[diff] [blame]	4598	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4599	{
				4600	struct cfs_rq *cfs_rq;
				4601
				4602	for_each_leaf_cfs_rq(rq, cfs_rq) {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4603	if (!cfs_rq->runtime_enabled)
				4604	continue;
				4605
				4606	/*
				4607	* clock_task is not advancing so we just need to make sure
				4608	* there's some valid quota amount
				4609	*/
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4610	cfs_rq->runtime_remaining = 1;
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4611	/*
				4612	* Offline rq is schedulable till cpu is completely disabled
				4613	* in take_cpu_down(), so we prevent new cfs throttling here.
				4614	*/
				4615	cfs_rq->runtime_enabled = 0;
				4616
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4617	if (cfs_rq_throttled(cfs_rq))
				4618	unthrottle_cfs_rq(cfs_rq);
				4619	}
				4620	}
				4621
				4622	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4623	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				4624	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4625	return rq_clock_task(rq_of(cfs_rq));
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4626	}
				4627
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	4628	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4629	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4630	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	55e16d3	2016-06-22 15:14:26 +0200	[diff] [blame]	4631	static inline void sync_throttle(struct task_group *tg, int cpu) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	4632	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4633
				4634	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				4635	{
				4636	return 0;
				4637	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4638
				4639	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				4640	{
				4641	return 0;
				4642	}
				4643
				4644	static inline int throttled_lb_pair(struct task_group *tg,
				4645	int src_cpu, int dest_cpu)
				4646	{
				4647	return 0;
				4648	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4649
				4650	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				4651
				4652	#ifdef CONFIG_FAIR_GROUP_SCHED
				4653	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	4654	#endif
				4655
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4656	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				4657	{
				4658	return NULL;
				4659	}
				4660	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4661	static inline void update_runtime_enabled(struct rq *rq) {}
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	4662	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4663
				4664	#endif /* CONFIG_CFS_BANDWIDTH */
				4665
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4666	/**************************************************
				4667	* CFS operations on tasks:
				4668	*/
				4669
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4670	#ifdef CONFIG_SCHED_HRTICK
				4671	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				4672	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4673	struct sched_entity *se = &p->se;
				4674	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4675
Peter Zijlstra	9148a3a	2016-09-20 22:34:51 +0200	[diff] [blame]	4676	SCHED_WARN_ON(task_rq(p) != rq);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4677
Srivatsa Vaddagiri	8bf46a3	2016-09-16 18:28:51 -0700	[diff] [blame]	4678	if (rq->cfs.h_nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4679	u64 slice = sched_slice(cfs_rq, se);
				4680	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				4681	s64 delta = slice - ran;
				4682
				4683	if (delta < 0) {
				4684	if (rq->curr == p)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	4685	resched_curr(rq);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4686	return;
				4687	}
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	4688	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4689	}
				4690	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4691
				4692	/*
				4693	* called from enqueue/dequeue and updates the hrtick when the
				4694	* current task is from our class and nr_running is low enough
				4695	* to matter.
				4696	*/
				4697	static void hrtick_update(struct rq *rq)
				4698	{
				4699	struct task_struct *curr = rq->curr;
				4700
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	4701	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4702	return;
				4703
				4704	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				4705	hrtick_start_fair(rq, curr);
				4706	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	4707	#else /* !CONFIG_SCHED_HRTICK */
/* High-resolution tick hooks: empty in this configuration. */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4716	#endif
				4717
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4718	/*
				4719	* The enqueue_task method is called before nr_running is
				4720	* increased. Here we update the fair scheduling stats and
				4721	* then put the task into the rbtree:
				4722	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	4723	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4724	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4725	{
				4726	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4727	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4728
Rafael J. Wysocki	8c34ab1	2016-09-09 23:59:33 +0200	[diff] [blame]	4729	/*
				4730	* If in_iowait is set, the code below may not trigger any cpufreq
				4731	* utilization updates, so do it here explicitly with the IOWAIT flag
				4732	* passed.
				4733	*/
				4734	if (p->in_iowait)
				4735	cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
				4736
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4737	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4738	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4739	break;
				4740	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4741	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4742
				4743	/*
				4744	* end evaluation on encountering a throttled cfs_rq
				4745	*
				4746	* note: in the case of encountering a throttled cfs_rq we will
				4747	* post the final h_nr_running increment below.
Peter Zijlstra	e210bff	2016-06-16 18:51:48 +0200	[diff] [blame]	4748	*/
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4749	if (cfs_rq_throttled(cfs_rq))
				4750	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4751	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4752
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4753	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4754	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4755
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4756	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	4757	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4758	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4759
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4760	if (cfs_rq_throttled(cfs_rq))
				4761	break;
				4762
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	4763	update_load_avg(se, UPDATE_TG);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	4764	update_cfs_shares(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4765	}
				4766
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4767	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4768	add_nr_running(rq, 1);
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4769
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4770	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4771	}
				4772
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4773	static void set_next_buddy(struct sched_entity *se);
				4774
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4775	/*
				4776	* The dequeue_task method is called before nr_running is
				4777	* decreased. We remove the task from the rbtree and
				4778	* update the fair scheduling stats:
				4779	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4780	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4781	{
				4782	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4783	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4784	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4785
				4786	for_each_sched_entity(se) {
				4787	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4788	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4789
				4790	/*
				4791	* end evaluation on encountering a throttled cfs_rq
				4792	*
				4793	* note: in the case of encountering a throttled cfs_rq we will
				4794	* post the final h_nr_running decrement below.
				4795	*/
				4796	if (cfs_rq_throttled(cfs_rq))
				4797	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4798	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4799
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4800	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4801	if (cfs_rq->load.weight) {
Konstantin Khlebnikov	754bd59	2016-06-16 15:57:15 +0300	[diff] [blame]	4802	/* Avoid re-evaluating load for this entity: */
				4803	se = parent_entity(se);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4804	/*
				4805	* Bias pick_next to pick a task from this cfs_rq, as
				4806	* p is sleeping when it is within its sched_slice.
				4807	*/
Konstantin Khlebnikov	754bd59	2016-06-16 15:57:15 +0300	[diff] [blame]	4808	if (task_sleep && se && !throttled_hierarchy(cfs_rq))
				4809	set_next_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4810	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4811	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4812	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4813	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4814
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4815	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	4816	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4817	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4818
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4819	if (cfs_rq_throttled(cfs_rq))
				4820	break;
				4821
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	4822	update_load_avg(se, UPDATE_TG);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	4823	update_cfs_shares(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4824	}
				4825
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4826	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4827	sub_nr_running(rq, 1);
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4828
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4829	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4830	}
				4831
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4832	#ifdef CONFIG_SMP
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	4833
/*
 * Per-CPU scratch cpumasks.
 *
 * load_balance_mask: working cpumask for load_balance, load_balance_newidle.
 * select_idle_mask:  its users are not visible in this part of the file;
 *                    NOTE(review): presumably the idle-CPU selection path.
 */
DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
				4837
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	4838	#ifdef CONFIG_NO_HZ_COMMON
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4839	/*
 * per rq 'load' array crap; XXX kill this.
				4841	*/
				4842
/*
 * The exact cpuload calculated at every tick would be:
 *
 *   load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
 *
 * If a cpu misses updates for n ticks (as it was idle) and update gets
 * called on the n+1-th tick when cpu may be busy, then we have:
 *
 *   load_n   = (1 - 1/2^i)^n * load_0
 *   load_n+1 = (1 - 1/2^i)   * load_n + (1/2^i) * cur_load
 *
 * decay_load_missed() below does efficient calculation of
 *
 *   load' = (1 - 1/2^i)^n * load
 *
 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
 * This allows us to precompute the above in said factors, thereby allowing the
 * reduction of an arbitrary n in O(log_2 n) steps. (See also
 * fixed_power_int())
 *
 * The calculation is approximated on a 128 point scale.
 */
#define DEGRADE_SHIFT		7

/*
 * degrade_zero_ticks[idx]: once at least this many updates were missed,
 * decay_load_missed() returns 0 for that idx without consulting the table.
 */
static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
/*
 * degrade_factor[idx][j] is (1 - 1/2^idx)^(2^j) expressed on the
 * 1 << DEGRADE_SHIFT (128) point scale, e.g. row 2: 0.75 * 128 = 96,
 * 0.75^2 * 128 = 72, 0.75^4 * 128 ~= 40. Bit j of the missed-update count
 * selects column j.
 */
static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 }
};
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4875
				4876	/*
				4877	* Update cpu_load for any missed ticks, due to tickless idle. The backlog
				4878	* would be when CPU is idle and so we just decay the old load without
				4879	* adding any new load.
				4880	*/
				4881	static unsigned long
				4882	decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
				4883	{
				4884	int j = 0;
				4885
				4886	if (!missed_updates)
				4887	return load;
				4888
				4889	if (missed_updates >= degrade_zero_ticks[idx])
				4890	return 0;
				4891
				4892	if (idx == 1)
				4893	return load >> missed_updates;
				4894
				4895	while (missed_updates) {
				4896	if (missed_updates % 2)
				4897	load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
				4898
				4899	missed_updates >>= 1;
				4900	j++;
				4901	}
				4902	return load;
				4903	}
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	4904	#endif /* CONFIG_NO_HZ_COMMON */
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4905
/**
 * cpu_load_update - update the rq->cpu_load[] statistics
 * @this_rq: The rq to update statistics for
 * @this_load: The current load
 * @pending_updates: The number of missed updates
 *
 * Update rq->cpu_load[] statistics. This function is usually called every
 * scheduler tick (TICK_NSEC).
 *
 * This function computes a decaying average:
 *
 *   load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
 *
 * Because of NOHZ it might not get called on every tick which gives need for
 * the @pending_updates argument.
 *
 *   load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
 *             = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
 *             = A * (A * load[i]_n-2 + B) + B
 *             = A * (A * (A * load[i]_n-3 + B) + B) + B
 *             = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
 *             = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
 *             = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
 *             = (1 - 1/2^i)^n * (load[i]_0 - load) + load
 *
 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
 * any change in load would have resulted in the tick being turned back on.
 *
 * For regular NOHZ, this reduces to:
 *
 *   load[i]_n = (1 - 1/2^i)^n * load[i]_0
 *
 * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra
 * term.
 */
static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
			    unsigned long pending_updates)
{
	/* cpu_load[0] as it stood before being overwritten with this_load below. */
	unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
	int i, scale;

	this_rq->nr_load_updates++;

	/* Update our load: */
	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
		unsigned long old_load, new_load;

		/* scale is effectively 1 << i now, and >> i divides by scale */

		old_load = this_rq->cpu_load[i];
#ifdef CONFIG_NO_HZ_COMMON
		/*
		 * Decay over the missed ticks only; the final (current) tick
		 * is folded in by the averaging step at the end of the loop.
		 */
		old_load = decay_load_missed(old_load, pending_updates - 1, i);
		if (tickless_load) {
			old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
			/*
			 * old_load can never be a negative value because a
			 * decayed tickless_load cannot be greater than the
			 * original tickless_load.
			 */
			old_load += tickless_load;
		}
#endif
		new_load = this_load;
		/*
		 * Round up the averaging division if load is increasing. This
		 * prevents us from getting stuck on 9 if the load is 10, for
		 * example.
		 */
		if (new_load > old_load)
			new_load += scale - 1;

		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
	}

	sched_avg_update(this_rq);
}
				4983
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	4984	/* Used instead of source_load when we know the type == 0 */
				4985	static unsigned long weighted_cpuload(const int cpu)
				4986	{
				4987	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
				4988	}
				4989
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4990	#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	4991	/*
				4992	* There is no sane way to deal with nohz on smp when using jiffies because the
				4993	* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
				4994	* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
				4995	*
				4996	* Therefore we need to avoid the delta approach from the regular tick when
				4997	* possible since that would seriously skew the load calculation. This is why we
				4998	* use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
				4999	* jiffies deltas for updates happening while in nohz mode (idle ticks, idle
				5000	* loop exit, nohz_idle_balance, nohz full exit...)
				5001	*
				5002	* This means we might still be one tick off for nohz periods.
				5003	*/
				5004
				5005	static void cpu_load_update_nohz(struct rq *this_rq,
				5006	unsigned long curr_jiffies,
				5007	unsigned long load)
Frederic Weisbecker	be68a68	2016-01-13 17:01:29 +0100	[diff] [blame]	5008	{
				5009	unsigned long pending_updates;
				5010
				5011	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
				5012	if (pending_updates) {
				5013	this_rq->last_load_update_tick = curr_jiffies;
				5014	/*
				5015	* In the regular NOHZ case, we were idle, this means load 0.
				5016	* In the NOHZ_FULL case, we were non-idle, we should consider
				5017	* its weighted load.
				5018	*/
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5019	cpu_load_update(this_rq, load, pending_updates);
Frederic Weisbecker	be68a68	2016-01-13 17:01:29 +0100	[diff] [blame]	5020	}
				5021	}
				5022
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5023	/*
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5024	* Called from nohz_idle_balance() to update the load ratings before doing the
				5025	* idle balance.
				5026	*/
Frederic Weisbecker	cee1afc	2016-04-13 15:56:50 +0200	[diff] [blame]	5027	static void cpu_load_update_idle(struct rq *this_rq)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5028	{
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5029	/*
				5030	* bail if there's load or we're actually up-to-date.
				5031	*/
Frederic Weisbecker	be68a68	2016-01-13 17:01:29 +0100	[diff] [blame]	5032	if (weighted_cpuload(cpu_of(this_rq)))
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5033	return;
				5034
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5035	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5036	}
				5037
				5038	/*
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5039	* Record CPU load on nohz entry so we know the tickless load to account
				5040	* on nohz exit. cpu_load[0] happens then to be updated more frequently
				5041	* than other cpu_load[idx] but it should be fine as cpu_load readers
				5042	* shouldn't rely into synchronized cpu_load[*] updates.
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5043	*/
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5044	void cpu_load_update_nohz_start(void)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5045	{
				5046	struct rq *this_rq = this_rq();
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5047
				5048	/*
				5049	* This is all lockless but should be fine. If weighted_cpuload changes
				5050	* concurrently we'll exit nohz. And cpu_load write can race with
				5051	* cpu_load_update_idle() but both updater would be writing the same.
				5052	*/
				5053	this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
				5054	}
				5055
/*
 * Account the tickless load in the end of a nohz frame.
 *
 * Does nothing if the rq was already updated during the current jiffy.
 * Otherwise the weighted load is sampled before the rq lock is taken, and
 * the cpu_load[] update itself runs under this_rq->lock with the rq clock
 * refreshed.
 */
void cpu_load_update_nohz_stop(void)
{
	unsigned long curr_jiffies = READ_ONCE(jiffies);
	struct rq *this_rq = this_rq();
	unsigned long load;

	/* Lockless early exit; cpu_load_update_nohz() re-derives the delta. */
	if (curr_jiffies == this_rq->last_load_update_tick)
		return;

	load = weighted_cpuload(cpu_of(this_rq));
	raw_spin_lock(&this_rq->lock);
	update_rq_clock(this_rq);
	cpu_load_update_nohz(this_rq, curr_jiffies, load);
	raw_spin_unlock(&this_rq->lock);
}
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5074	#else /* !CONFIG_NO_HZ_COMMON */
/*
 * Without CONFIG_NO_HZ_COMMON there are no tickless periods to account for,
 * so the nohz update is an empty stub; it keeps callers such as
 * cpu_load_update_active() free of #ifdefs.
 */
static inline void cpu_load_update_nohz(struct rq *this_rq,
					unsigned long curr_jiffies,
					unsigned long load) { }
				5078	#endif /* CONFIG_NO_HZ_COMMON */
				5079
				5080	static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
				5081	{
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	5082	#ifdef CONFIG_NO_HZ_COMMON
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5083	/* See the mess around cpu_load_update_nohz(). */
				5084	this_rq->last_load_update_tick = READ_ONCE(jiffies);
Frederic Weisbecker	9fd81dd	2016-04-19 17:36:51 +0200	[diff] [blame]	5085	#endif
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5086	cpu_load_update(this_rq, load, 1);
				5087	}
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5088
				5089	/*
				5090	* Called from scheduler_tick()
				5091	*/
Frederic Weisbecker	cee1afc	2016-04-13 15:56:50 +0200	[diff] [blame]	5092	void cpu_load_update_active(struct rq *this_rq)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5093	{
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	5094	unsigned long load = weighted_cpuload(cpu_of(this_rq));
Frederic Weisbecker	1f41906	2016-04-13 15:56:51 +0200	[diff] [blame]	5095
				5096	if (tick_nohz_tick_stopped())
				5097	cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
				5098	else
				5099	cpu_load_update_periodic(this_rq, load);
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	5100	}
				5101
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5102	/*
				5103	* Return a low guess at the load of a migration-source cpu weighted
				5104	* according to the scheduling class and "nice" value.
				5105	*
				5106	* We want to under-estimate the load of migration sources, to
				5107	* balance conservatively.
				5108	*/
				5109	static unsigned long source_load(int cpu, int type)
				5110	{
				5111	struct rq *rq = cpu_rq(cpu);
				5112	unsigned long total = weighted_cpuload(cpu);
				5113
				5114	if (type == 0 \|\| !sched_feat(LB_BIAS))
				5115	return total;
				5116
				5117	return min(rq->cpu_load[type-1], total);
				5118	}
				5119
				5120	/*
				5121	* Return a high guess at the load of a migration-target cpu weighted
				5122	* according to the scheduling class and "nice" value.
				5123	*/
				5124	static unsigned long target_load(int cpu, int type)
				5125	{
				5126	struct rq *rq = cpu_rq(cpu);
				5127	unsigned long total = weighted_cpuload(cpu);
				5128
				5129	if (type == 0 \|\| !sched_feat(LB_BIAS))
				5130	return total;
				5131
				5132	return max(rq->cpu_load[type-1], total);
				5133	}
				5134
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	5135	static unsigned long capacity_of(int cpu)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5136	{
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	5137	return cpu_rq(cpu)->cpu_capacity;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5138	}
				5139
Vincent Guittot	ca6d75e	2015-02-27 16:54:09 +0100	[diff] [blame]	5140	static unsigned long capacity_orig_of(int cpu)
				5141	{
				5142	return cpu_rq(cpu)->cpu_capacity_orig;
				5143	}
				5144
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5145	static unsigned long cpu_avg_load_per_task(int cpu)
				5146	{
				5147	struct rq *rq = cpu_rq(cpu);
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	5148	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	5149	unsigned long load_avg = weighted_cpuload(cpu);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5150
				5151	if (nr_running)
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	5152	return load_avg / nr_running;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5153
				5154	return 0;
				5155	}
				5156
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	5157	#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * effective_load() calculates the load change as seen from the root_task_group
 *
 * Adding load to a group doesn't make a group heavier, but can cause movement
 * of group shares between cpus. Assuming the shares were perfectly aligned one
 * can calculate the shift in shares.
 *
 * Calculate the effective load difference if @wl is added (subtracted) to @tg
 * on this @cpu and results in a total addition (subtraction) of @wg to the
 * total group weight.
 *
 * Given a runqueue weight distribution (rw_i) we can compute a shares
 * distribution (s_i) using:
 *
 *   s_i = rw_i / \Sum rw_j						(1)
 *
 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
 * shares distribution (s_i):
 *
 *   rw_i = {   2,   4,   1,   0 }
 *   s_i  = { 2/7, 4/7, 1/7,   0 }
 *
 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
 * task used to run on and the CPU the waker is running on), we need to
 * compute the effect of waking a task on either CPU and, in case of a sync
 * wakeup, compute the effect of the current task going to sleep.
 *
 * So for a change of @wl to the local @cpu with an overall group weight change
 * of @wg we can compute the new shares distribution (s'_i) using:
 *
 *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
 *
 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
 * differences in waking a task to CPU 0. The additional task changes the
 * weight and shares distributions like:
 *
 *   rw'_i = {   3,   4,   1,   0 }
 *   s'_i  = { 3/8, 4/8, 1/8,   0 }
 *
 * We can then compute the difference in effective weight by using:
 *
 *   dw_i = S * (s'_i - s_i)						(3)
 *
 * Where 'S' is the group weight as seen by its parent.
 *
 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
 * 4/7) times the weight of the group.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	struct sched_entity *se = tg->se[cpu];

	if (!tg->parent)	/* the trivial, non-cgroup case */
		return wl;

	/* Each iteration turns wl into the share shift seen at that level. */
	for_each_sched_entity(se) {
		struct cfs_rq *cfs_rq = se->my_q;
		long W, w = cfs_rq_load_avg(cfs_rq);

		tg = cfs_rq->tg;

		/*
		 * W = @wg + \Sum rw_j
		 */
		W = wg + atomic_long_read(&tg->load_avg);

		/* Ensure \Sum rw_j >= rw_i */
		W -= cfs_rq->tg_load_avg_contrib;
		W += w;

		/*
		 * w = rw_i + @wl
		 */
		w += wl;

		/*
		 * wl = S * s'_i; see (2)
		 */
		if (W > 0 && w < W)
			wl = (w * (long)scale_load_down(tg->shares)) / W;
		else
			wl = scale_load_down(tg->shares);

		/*
		 * Per the above, wl is the new se->load.weight value; since
		 * those are clipped to [MIN_SHARES, ...) do so now. See
		 * calc_cfs_shares().
		 */
		if (wl < MIN_SHARES)
			wl = MIN_SHARES;

		/*
		 * wl = dw_i = S * (s'_i - s_i); see (3)
		 */
		wl -= se->avg.load_avg;

		/*
		 * Recursively apply this logic to all parent groups to compute
		 * the final effective load change on the root group. Since
		 * only the @tg group gets extra weight, all parent groups can
		 * only redistribute existing shares. @wl is the shift in shares
		 * resulting from this level per the above.
		 */
		wg = 0;
	}

	return wl;
}
				5268	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5269
/*
 * Without CONFIG_FAIR_GROUP_SCHED there is no group hierarchy to propagate
 * through, so the load change is @wl unmodified; @tg, @cpu and @wg are
 * unused.
 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	return wl;
}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	5274
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	5275	#endif
				5276
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5277	static void record_wakee(struct task_struct *p)
				5278	{
				5279	/*
				5280	* Only decay a single time; tasks that have less then 1 wakeup per
				5281	* jiffy will not have built up many flips.
				5282	*/
				5283	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
				5284	current->wakee_flips >>= 1;
				5285	current->wakee_flip_decay_ts = jiffies;
				5286	}
				5287
				5288	if (current->last_wakee != p) {
				5289	current->last_wakee = p;
				5290	current->wakee_flips++;
				5291	}
				5292	}
				5293
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5294	/*
				5295	* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5296	*
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5297	* A waker of many should wake a different task than the one last awakened
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5298	* at a frequency roughly N times higher than one of its wakees.
				5299	*
				5300	* In order to determine whether we should let the load spread vs consolidating
				5301	* to shared cache, we look for a minimum 'flip' frequency of llc_size in one
				5302	* partner, and a factor of lls_size higher frequency in the other.
				5303	*
				5304	* With both conditions met, we can be relatively sure that the relationship is
				5305	* non-monogamous, with partner count exceeding socket size.
				5306	*
				5307	* Waker/wakee being client/server, worker/dispatcher, interrupt source or
				5308	* whatever is irrelevant, spread criteria is apparent partner count exceeds
				5309	* socket size.
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5310	*/
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	5311	static int wake_wide(struct task_struct *p)
				5312	{
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5313	unsigned int master = current->wakee_flips;
				5314	unsigned int slave = p->wakee_flips;
Peter Zijlstra	7d9ffa8	2013-07-04 12:56:46 +0800	[diff] [blame]	5315	int factor = this_cpu_read(sd_llc_size);
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	5316
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5317	if (master < slave)
				5318	swap(master, slave);
				5319	if (slave < factor \|\| master < slave * factor)
				5320	return 0;
				5321	return 1;
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	5322	}
				5323
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5324	static int wake_affine(struct sched_domain sd, struct task_struct p,
				5325	int prev_cpu, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5326	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	5327	s64 this_load, load;
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5328	s64 this_eff_load, prev_eff_load;
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5329	int idx, this_cpu;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5330	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	5331	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	5332	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5333
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5334	idx = sd->wake_idx;
				5335	this_cpu = smp_processor_id();
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5336	load = source_load(prev_cpu, idx);
				5337	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5338
				5339	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5340	* If sync wakeup then subtract the (maximum possible)
				5341	* effect of the currently running task from the load
				5342	* of the current CPU:
				5343	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	5344	if (sync) {
				5345	tg = task_group(current);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5346	weight = current->se.avg.load_avg;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5347
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5348	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	5349	load += effective_load(tg, prev_cpu, 0, -weight);
				5350	}
				5351
				5352	tg = task_group(p);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5353	weight = p->se.avg.load_avg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	5354
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	5355	/*
				5356	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5357	* due to the sync cause above having dropped this_load to 0, we'll
				5358	* always have an imbalance, but there's really nothing you can do
				5359	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	5360	*
				5361	* Otherwise check if either cpus are near enough in load to allow this
				5362	* task to be woken on this_cpu.
				5363	*/
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5364	this_eff_load = 100;
				5365	this_eff_load *= capacity_of(prev_cpu);
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5366
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5367	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				5368	prev_eff_load *= capacity_of(this_cpu);
				5369
				5370	if (this_load > 0) {
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5371	this_eff_load *= this_load +
				5372	effective_load(tg, this_cpu, weight, weight);
				5373
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5374	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5375	}
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	5376
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	5377	balanced = this_eff_load <= prev_eff_load;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	5378
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	5379	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	5380
Vincent Guittot	05bfb65	2014-08-26 13:06:45 +0200	[diff] [blame]	5381	if (!balanced)
				5382	return 0;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5383
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	5384	schedstat_inc(sd->ttwu_move_affine);
				5385	schedstat_inc(p->se.statistics.nr_wakeups_affine);
Vincent Guittot	05bfb65	2014-08-26 13:06:45 +0200	[diff] [blame]	5386
				5387	return 1;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5388	}
				5389
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5390	static inline int task_util(struct task_struct *p);
				5391	static int cpu_util_wake(int cpu, struct task_struct *p);
				5392
				5393	static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
				5394	{
				5395	return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
				5396	}
				5397
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5398	/*
				5399	* find_idlest_group finds and returns the least busy CPU group within the
				5400	* domain.
				5401	*/
				5402	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	5403	find_idlest_group(struct sched_domain *sd, struct task_struct *p,
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	5404	int this_cpu, int sd_flag)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5405	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	5406	struct sched_group *idlest = NULL, *group = sd->groups;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5407	struct sched_group *most_spare_sg = NULL;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5408	unsigned long min_load = ULONG_MAX, this_load = 0;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5409	unsigned long most_spare = 0, this_spare = 0;
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	5410	int load_idx = sd->forkexec_idx;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5411	int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5412
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	5413	if (sd_flag & SD_BALANCE_WAKE)
				5414	load_idx = sd->wake_idx;
				5415
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5416	do {
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5417	unsigned long load, avg_load, spare_cap, max_spare_cap;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5418	int local_group;
				5419	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5420
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5421	/* Skip over this group if it has no CPUs allowed */
				5422	if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	5423	tsk_cpus_allowed(p)))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5424	continue;
				5425
				5426	local_group = cpumask_test_cpu(this_cpu,
				5427	sched_group_cpus(group));
				5428
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5429	/*
				5430	* Tally up the load of all CPUs in the group and find
				5431	* the group containing the CPU with most spare capacity.
				5432	*/
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5433	avg_load = 0;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5434	max_spare_cap = 0;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5435
				5436	for_each_cpu(i, sched_group_cpus(group)) {
				5437	/* Bias balancing toward cpus of our domain */
				5438	if (local_group)
				5439	load = source_load(i, load_idx);
				5440	else
				5441	load = target_load(i, load_idx);
				5442
				5443	avg_load += load;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5444
				5445	spare_cap = capacity_spare_wake(i, p);
				5446
				5447	if (spare_cap > max_spare_cap)
				5448	max_spare_cap = spare_cap;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5449	}
				5450
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	5451	/* Adjust by relative CPU capacity of the group */
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	5452	avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5453
				5454	if (local_group) {
				5455	this_load = avg_load;
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5456	this_spare = max_spare_cap;
				5457	} else {
				5458	if (avg_load < min_load) {
				5459	min_load = avg_load;
				5460	idlest = group;
				5461	}
				5462
				5463	if (most_spare < max_spare_cap) {
				5464	most_spare = max_spare_cap;
				5465	most_spare_sg = group;
				5466	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5467	}
				5468	} while (group = group->next, group != sd->groups);
				5469
Morten Rasmussen	6a0b19c	2016-10-14 14:41:08 +0100	[diff] [blame]	5470	/*
				5471	* The cross-over point between using spare capacity or least load
				5472	* is too conservative for high utilization tasks on partially
				5473	* utilized systems if we require spare_capacity > task_util(p),
				5474	* so we allow for some task stuffing by using
				5475	* spare_capacity > task_util(p)/2.
				5476	*/
				5477	if (this_spare > task_util(p) / 2 &&
				5478	imbalance*this_spare > 100*most_spare)
				5479	return NULL;
				5480	else if (most_spare > task_util(p) / 2)
				5481	return most_spare_sg;
				5482
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5483	if (!idlest || 100*this_load < imbalance*min_load)
				5484	return NULL;
				5485	return idlest;
				5486	}
				5487
				5488	/*
				5489	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				5490	*/
				5491	static int
				5492	find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
				5493	{
				5494	unsigned long load, min_load = ULONG_MAX;
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	5495	unsigned int min_exit_latency = UINT_MAX;
				5496	u64 latest_idle_timestamp = 0;
				5497	int least_loaded_cpu = this_cpu;
				5498	int shallowest_idle_cpu = -1;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5499	int i;
				5500
Morten Rasmussen	eaecf41	2016-06-22 18:03:14 +0100	[diff] [blame]	5501	/* Check if we have any choice: */
				5502	if (group->group_weight == 1)
				5503	return cpumask_first(sched_group_cpus(group));
				5504
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5505	/* Traverse only the allowed CPUs */
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	5506	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	5507	if (idle_cpu(i)) {
				5508	struct rq *rq = cpu_rq(i);
				5509	struct cpuidle_state *idle = idle_get_state(rq);
				5510	if (idle && idle->exit_latency < min_exit_latency) {
				5511	/*
				5512	* We give priority to a CPU whose idle state
				5513	* has the smallest exit latency irrespective
				5514	* of any idle timestamp.
				5515	*/
				5516	min_exit_latency = idle->exit_latency;
				5517	latest_idle_timestamp = rq->idle_stamp;
				5518	shallowest_idle_cpu = i;
				5519	} else if ((!idle || idle->exit_latency == min_exit_latency) &&
				5520	rq->idle_stamp > latest_idle_timestamp) {
				5521	/*
				5522	* If equal or no active idle state, then
				5523	* the most recently idled CPU might have
				5524	* a warmer cache.
				5525	*/
				5526	latest_idle_timestamp = rq->idle_stamp;
				5527	shallowest_idle_cpu = i;
				5528	}
Yao Dongdong	9f96742	2014-10-28 04:08:06 +0000	[diff] [blame]	5529	} else if (shallowest_idle_cpu == -1) {
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	5530	load = weighted_cpuload(i);
				5531	if (load < min_load || (load == min_load && i == this_cpu)) {
				5532	min_load = load;
				5533	least_loaded_cpu = i;
				5534	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5535	}
				5536	}
				5537
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	5538	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5539	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5540
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5541	/*
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5542	* Implement a for_each_cpu() variant that starts the scan at a given cpu
				5543	* (@start), and wraps around.
				5544	*
				5545	* This is used to scan for idle CPUs; such that not all CPUs looking for an
				5546	* idle CPU find the same CPU. The down-side is that tasks tend to cycle
				5547	* through the LLC domain.
				5548	*
				5549	* Especially tbench is found sensitive to this.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5550	*/
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5551
				5552	static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
				5553	{
				5554	int next;
				5555
				5556	again:
				5557	next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
				5558
				5559	if (*wrapped) {
				5560	if (next >= start)
				5561	return nr_cpumask_bits;
				5562	} else {
				5563	if (next >= nr_cpumask_bits) {
				5564	*wrapped = 1;
				5565	n = -1;
				5566	goto again;
				5567	}
				5568	}
				5569
				5570	return next;
				5571	}
				5572
				5573	#define for_each_cpu_wrap(cpu, mask, start, wrap) \
				5574	for ((wrap) = 0, (cpu) = (start)-1; \
				5575	(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
				5576	(cpu) < nr_cpumask_bits; )
				5577
				5578	#ifdef CONFIG_SCHED_SMT
				5579
				5580	static inline void set_idle_cores(int cpu, int val)
				5581	{
				5582	struct sched_domain_shared *sds;
				5583
				5584	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				5585	if (sds)
				5586	WRITE_ONCE(sds->has_idle_cores, val);
				5587	}
				5588
				5589	static inline bool test_idle_cores(int cpu, bool def)
				5590	{
				5591	struct sched_domain_shared *sds;
				5592
				5593	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				5594	if (sds)
				5595	return READ_ONCE(sds->has_idle_cores);
				5596
				5597	return def;
				5598	}
				5599
				5600	/*
				5601	* Scans the local SMT mask to see if the entire core is idle, and records this
				5602	* information in sd_llc_shared->has_idle_cores.
				5603	*
				5604	* Since SMT siblings share all cache levels, inspecting this limited remote
				5605	* state should be fairly cheap.
				5606	*/
Peter Zijlstra	1b568f0	2016-05-09 10:38:41 +0200	[diff] [blame]	5607	void __update_idle_core(struct rq *rq)
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5608	{
				5609	int core = cpu_of(rq);
				5610	int cpu;
				5611
				5612	rcu_read_lock();
				5613	if (test_idle_cores(core, true))
				5614	goto unlock;
				5615
				5616	for_each_cpu(cpu, cpu_smt_mask(core)) {
				5617	if (cpu == core)
				5618	continue;
				5619
				5620	if (!idle_cpu(cpu))
				5621	goto unlock;
				5622	}
				5623
				5624	set_idle_cores(core, 1);
				5625	unlock:
				5626	rcu_read_unlock();
				5627	}
				5628
				5629	/*
				5630	* Scan the entire LLC domain for idle cores; this dynamically switches off if
				5631	* there are no idle cores left in the system; tracked through
				5632	* sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
				5633	*/
				5634	static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
				5635	{
				5636	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
				5637	int core, cpu, wrap;
				5638
Peter Zijlstra	1b568f0	2016-05-09 10:38:41 +0200	[diff] [blame]	5639	if (!static_branch_likely(&sched_smt_present))
				5640	return -1;
				5641
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5642	if (!test_idle_cores(target, false))
				5643	return -1;
				5644
				5645	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
				5646
				5647	for_each_cpu_wrap(core, cpus, target, wrap) {
				5648	bool idle = true;
				5649
				5650	for_each_cpu(cpu, cpu_smt_mask(core)) {
				5651	cpumask_clear_cpu(cpu, cpus);
				5652	if (!idle_cpu(cpu))
				5653	idle = false;
				5654	}
				5655
				5656	if (idle)
				5657	return core;
				5658	}
				5659
				5660	/*
				5661	* Failed to find an idle core; stop looking for one.
				5662	*/
				5663	set_idle_cores(target, 0);
				5664
				5665	return -1;
				5666	}
				5667
				5668	/*
				5669	* Scan the local SMT mask for idle CPUs.
				5670	*/
				5671	static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
				5672	{
				5673	int cpu;
				5674
Peter Zijlstra	1b568f0	2016-05-09 10:38:41 +0200	[diff] [blame]	5675	if (!static_branch_likely(&sched_smt_present))
				5676	return -1;
				5677
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5678	for_each_cpu(cpu, cpu_smt_mask(target)) {
				5679	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
				5680	continue;
				5681	if (idle_cpu(cpu))
				5682	return cpu;
				5683	}
				5684
				5685	return -1;
				5686	}
				5687
				5688	#else /* CONFIG_SCHED_SMT */
				5689
				5690	static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
				5691	{
				5692	return -1;
				5693	}
				5694
				5695	static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
				5696	{
				5697	return -1;
				5698	}
				5699
				5700	#endif /* CONFIG_SCHED_SMT */
				5701
				5702	/*
				5703	* Scan the LLC domain for idle CPUs; this is dynamically regulated by
				5704	* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
				5705	* average idle time for this rq (as found in rq->avg_idle).
				5706	*/
				5707	static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
				5708	{
Wanpeng Li	9cfb38a	2016-10-09 08:04:03 +0800	[diff] [blame]	5709	struct sched_domain *this_sd;
				5710	u64 avg_cost, avg_idle = this_rq()->avg_idle;
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5711	u64 time, cost;
				5712	s64 delta;
				5713	int cpu, wrap;
				5714
Wanpeng Li	9cfb38a	2016-10-09 08:04:03 +0800	[diff] [blame]	5715	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
				5716	if (!this_sd)
				5717	return -1;
				5718
				5719	avg_cost = this_sd->avg_scan_cost;
				5720
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5721	/*
				5722	* Due to large variance we need a large fuzz factor; hackbench in
				5723	* particularly is sensitive here.
				5724	*/
				5725	if ((avg_idle / 512) < avg_cost)
				5726	return -1;
				5727
				5728	time = local_clock();
				5729
				5730	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
				5731	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
				5732	continue;
				5733	if (idle_cpu(cpu))
				5734	break;
				5735	}
				5736
				5737	time = local_clock() - time;
				5738	cost = this_sd->avg_scan_cost;
				5739	delta = (s64)(time - cost) / 8;
				5740	this_sd->avg_scan_cost += delta;
				5741
				5742	return cpu;
				5743	}
				5744
				5745	/*
				5746	* Try and locate an idle core/thread in the LLC cache domain.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5747	*/
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5748	static int select_idle_sibling(struct task_struct *p, int prev, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5749	{
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5750	struct sched_domain *sd;
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5751	int i;
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	5752
				5753	if (idle_cpu(target))
				5754	return target;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5755
				5756	/*
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5757	* If the previous cpu is cache affine and idle, don't be stupid.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5758	*/
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5759	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
				5760	return prev;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5761
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	5762	sd = rcu_dereference(per_cpu(sd_llc, target));
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5763	if (!sd)
				5764	return target;
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5765
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5766	i = select_idle_core(p, sd, target);
				5767	if ((unsigned)i < nr_cpumask_bits)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5768	return i;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	5769
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5770	i = select_idle_cpu(p, sd, target);
				5771	if ((unsigned)i < nr_cpumask_bits)
				5772	return i;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	5773
Peter Zijlstra	10e2f1a	2016-05-09 10:38:05 +0200	[diff] [blame]	5774	i = select_idle_smt(p, sd, target);
				5775	if ((unsigned)i < nr_cpumask_bits)
				5776	return i;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	5777
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5778	return target;
				5779	}
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5780
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5781	/*
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5782	* cpu_util returns the amount of capacity of a CPU that is used by CFS
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5783	* tasks. The unit of the return value must be the one of capacity so we can
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5784	* compare the utilization with the capacity of the CPU that is available for
				5785	* CFS task (ie cpu_capacity).
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5786	*
				5787	* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
				5788	* recent utilization of currently non-runnable tasks on a CPU. It represents
				5789	* the amount of utilization of a CPU in the range [0..capacity_orig] where
				5790	* capacity_orig is the cpu_capacity available at the highest frequency
				5791	* (arch_scale_freq_capacity()).
				5792	* The utilization of a CPU converges towards a sum equal to or less than the
				5793	* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
				5794	* the running time on this CPU scaled by capacity_curr.
				5795	*
				5796	* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
				5797	* higher than capacity_orig because of unfortunate rounding in
				5798	* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
				5799	* the average stabilizes with the new running time. We need to check that the
				5800	* utilization stays within the range of [0..capacity_orig] and cap it if
				5801	* necessary. Without utilization capping, a group could be seen as overloaded
				5802	* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
				5803	* available capacity. We allow utilization to overshoot capacity_curr (but not
				5804	* capacity_orig) as it useful for predicting the capacity required after task
				5805	* migrations (scheduler-driven DVFS).
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5806	*/
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5807	static int cpu_util(int cpu)
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5808	{
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5809	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5810	unsigned long capacity = capacity_orig_of(cpu);
				5811
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5812	return (util >= capacity) ? capacity : util;
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5813	}
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5814
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	5815	static inline int task_util(struct task_struct *p)
				5816	{
				5817	return p->se.avg.util_avg;
				5818	}
				5819
				5820	/*
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	5821	* cpu_util_wake: Compute cpu utilization with any contributions from
				5822	* the waking task p removed.
				5823	*/
				5824	static int cpu_util_wake(int cpu, struct task_struct *p)
				5825	{
				5826	unsigned long util, capacity;
				5827
				5828	/* Task has no contribution or is new */
				5829	if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
				5830	return cpu_util(cpu);
				5831
				5832	capacity = capacity_orig_of(cpu);
				5833	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
				5834
				5835	return (util >= capacity) ? capacity : util;
				5836	}
				5837
				5838	/*
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	5839	* Disable WAKE_AFFINE in the case where task @p doesn't fit in the
				5840	* capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
				5841	*
				5842	* In that case WAKE_AFFINE doesn't make sense and we'll let
				5843	* BALANCE_WAKE sort things out.
				5844	*/
				5845	static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
				5846	{
				5847	long min_cap, max_cap;
				5848
				5849	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
				5850	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
				5851
				5852	/* Minimum capacity is close to max, no need to abort wake_affine */
				5853	if (max_cap - min_cap < max_cap >> 3)
				5854	return 0;
				5855
Morten Rasmussen	104cb16	2016-10-14 14:41:07 +0100	[diff] [blame]	5856	/* Bring task utilization in sync with prev_cpu */
				5857	sync_entity_load_avg(&p->se);
				5858
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	5859	return min_cap * 1024 < task_util(p) * capacity_margin;
				5860	}
				5861
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5862	/*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5863	* select_task_rq_fair: Select target runqueue for the waking task in domains
				5864	* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
				5865	* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5866	*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5867	* Balances load by selecting the idlest cpu in the idlest group, or under
				5868	* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5869	*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5870	* Returns the target cpu number.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5871	*
				5872	* preempt must be disabled.
				5873	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	5874	static int
Peter Zijlstra	ac66f54	2013-10-07 11:29:16 +0100	[diff] [blame]	5875	select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5876	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	5877	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5878	int cpu = smp_processor_id();
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5879	int new_cpu = prev_cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5880	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	5881	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5882
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5883	if (sd_flag & SD_BALANCE_WAKE) {
				5884	record_wakee(p);
Morten Rasmussen	3273163	2016-07-25 14:34:26 +0100	[diff] [blame]	5885	want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
				5886	&& cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
Peter Zijlstra	c58d25f	2016-05-12 09:19:59 +0200	[diff] [blame]	5887	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5888
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5889	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5890	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f4288	2009-12-16 18:04:34 +0100	[diff] [blame]	5891	if (!(tmp->flags & SD_LOAD_BALANCE))
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5892	break;
Peter Zijlstra	e4f4288	2009-12-16 18:04:34 +0100	[diff] [blame]	5893
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5894	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5895	* If both cpu and prev_cpu are part of this domain,
				5896	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	5897	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5898	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				5899	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				5900	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	5901	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5902	}
				5903
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	5904	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	5905	sd = tmp;
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5906	else if (!want_affine)
				5907	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5908	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5909
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5910	if (affine_sd) {
				5911	sd = NULL; /* Prefer wake_affine over balance flags */
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5912	if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5913	new_cpu = cpu;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	5914	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	5915
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5916	if (!sd) {
				5917	if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
Morten Rasmussen	772bd008c	2016-06-22 18:03:13 +0100	[diff] [blame]	5918	new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5919
				5920	} else while (sd) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5921	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5922	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5923
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	5924	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5925	sd = sd->child;
				5926	continue;
				5927	}
				5928
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	5929	group = find_idlest_group(sd, p, cpu, sd_flag);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5930	if (!group) {
				5931	sd = sd->child;
				5932	continue;
				5933	}
				5934
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	5935	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5936	if (new_cpu == -1 || new_cpu == cpu) {
				5937	/* Now try balancing at a lower domain level of cpu */
				5938	sd = sd->child;
				5939	continue;
				5940	}
				5941
				5942	/* Now try balancing at a lower domain level of new_cpu */
				5943	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	5944	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5945	sd = NULL;
				5946	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	5947	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5948	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	5949	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5950	sd = tmp;
				5951	}
				5952	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5953	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5954	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5955
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5956	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5957	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	5958
				5959	/*
				5960	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				5961	* cfs_rq_of(p) references at time of call are still valid and identify the
Byungchul Park	525628c	2015-11-18 09:34:59 +0900	[diff] [blame]	5962	* previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	5963	*/
xiaofeng.yan	5a4fd03	2015-09-23 14:55:59 +0800	[diff] [blame]	5964	static void migrate_task_rq_fair(struct task_struct *p)
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	5965	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5966	/*
Peter Zijlstra	59efa0b	2016-05-10 18:24:37 +0200	[diff] [blame]	5967	* As blocked tasks retain absolute vruntime the migration needs to
				5968	* deal with this by subtracting the old and adding the new
				5969	* min_vruntime -- the latter is done by enqueue_entity() when placing
				5970	* the task on the new runqueue.
				5971	*/
				5972	if (p->state == TASK_WAKING) {
				5973	struct sched_entity *se = &p->se;
				5974	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				5975	u64 min_vruntime;
				5976
				5977	#ifndef CONFIG_64BIT
				5978	u64 min_vruntime_copy;
				5979
				5980	do {
				5981	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				5982	smp_rmb();
				5983	min_vruntime = cfs_rq->min_vruntime;
				5984	} while (min_vruntime != min_vruntime_copy);
				5985	#else
				5986	min_vruntime = cfs_rq->min_vruntime;
				5987	#endif
				5988
				5989	se->vruntime -= min_vruntime;
				5990	}
				5991
				5992	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5993	* We are supposed to update the task to "current" time, then its up to date
				5994	* and ready to go to new CPU/cfs_rq. But we have difficulty in getting
				5995	* what current time is, so simply throw away the out-of-date time. This
				5996	* will result in the wakee task is less decayed, but giving the wakee more
				5997	* load sounds not bad.
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5998	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5999	remove_entity_load_avg(&p->se);
				6000
				6001	/* Tell new CPU we are migrated */
				6002	p->se.avg.last_update_time = 0;
Ben Segall	3944a92	2014-05-15 15:59:20 -0700	[diff] [blame]	6003
				6004	/* We have migrated, no longer consider this task hot */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6005	p->se.exec_start = 0;
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6006	}
Yuyang Du	1269557	2015-07-15 08:04:40 +0800	[diff] [blame]	6007
				6008	static void task_dead_fair(struct task_struct *p)
				6009	{
				6010	remove_entity_load_avg(&p->se);
				6011	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	6012	#endif /* CONFIG_SMP */
				6013
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	6014	static unsigned long
				6015	wakeup_gran(struct sched_entity curr, struct sched_entity se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	6016	{
				6017	unsigned long gran = sysctl_sched_wakeup_granularity;
				6018
				6019	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	6020	* Since its curr running now, convert the gran from real-time
				6021	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	6022	*
				6023	* By using 'se' instead of 'curr' we penalize light tasks, so
				6024	* they get preempted easier. That is, if 'se' < 'curr' then
				6025	* the resulting gran will be larger, therefore penalizing the
				6026	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				6027	* be smaller, again penalizing the lighter task.
				6028	*
				6029	* This is especially important for buddies when the leftmost
				6030	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	6031	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	6032	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	6033	}
				6034
				6035	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	6036	* Should 'se' preempt 'curr'.
				6037	*
				6038	* \|s1
				6039	* \|s2
				6040	* \|s3
				6041	* g
				6042	* \|<--->\|c
				6043	*
				6044	* w(c, s1) = -1
				6045	* w(c, s2) = 0
				6046	* w(c, s3) = 1
				6047	*
				6048	*/
				6049	static int
				6050	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				6051	{
				6052	s64 gran, vdiff = curr->vruntime - se->vruntime;
				6053
				6054	if (vdiff <= 0)
				6055	return -1;
				6056
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	6057	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	6058	if (vdiff > gran)
				6059	return 1;
				6060
				6061	return 0;
				6062	}
				6063
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	6064	static void set_last_buddy(struct sched_entity *se)
				6065	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	6066	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				6067	return;
				6068
				6069	for_each_sched_entity(se)
				6070	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	6071	}
				6072
				6073	static void set_next_buddy(struct sched_entity *se)
				6074	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	6075	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				6076	return;
				6077
				6078	for_each_sched_entity(se)
				6079	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	6080	}
				6081
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	6082	static void set_skip_buddy(struct sched_entity *se)
				6083	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	6084	for_each_sched_entity(se)
				6085	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	6086	}
				6087
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	6088	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6089	* Preempt the current task with a newly woken task if needed:
				6090	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	6091	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6092	{
				6093	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	6094	struct sched_entity se = &curr->se, pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	6095	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	6096	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6097	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	6098
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	6099	if (unlikely(se == pse))
				6100	return;
				6101
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	6102	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6103	* This is possible from callers such as attach_tasks(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	6104	* unconditionally check_prempt_curr() after an enqueue (which may have
				6105	* lead to a throttle). This both saves work and prevents false
				6106	* next-buddy nomination below.
				6107	*/
				6108	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				6109	return;
				6110
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6111	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	6112	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6113	next_buddy_marked = 1;
				6114	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	6115
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	6116	/*
				6117	* We can come here with TIF_NEED_RESCHED already set from new task
				6118	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	6119	*
				6120	* Note: this also catches the edge-case of curr being in a throttled
				6121	* group (e.g. via set_curr_task), since update_curr() (in the
				6122	* enqueue of curr) will have resulted in resched being set. This
				6123	* prevents us from potentially nominating it as a false LAST_BUDDY
				6124	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	6125	*/
				6126	if (test_tsk_need_resched(curr))
				6127	return;
				6128
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	6129	/* Idle tasks are by definition preempted by non-idle tasks. */
				6130	if (unlikely(curr->policy == SCHED_IDLE) &&
				6131	likely(p->policy != SCHED_IDLE))
				6132	goto preempt;
				6133
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	6134	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	6135	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				6136	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	6137	*/
Ingo Molnar	8ed92e5	2012-10-14 14:28:50 +0200	[diff] [blame]	6138	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	6139	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6140
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6141	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	6142	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6143	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6144	if (wakeup_preempt_entity(se, pse) == 1) {
				6145	/*
				6146	* Bias pick_next to pick the sched entity that is
				6147	* triggering this preemption.
				6148	*/
				6149	if (!next_buddy_marked)
				6150	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6151	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	6152	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	6153
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6154	return;
				6155
				6156	preempt:
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	6157	resched_curr(rq);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	6158	/*
				6159	* Only set the backward buddy when the current task is still
				6160	* on the rq. This can happen when a wakeup gets interleaved
				6161	* with schedule on the ->pre_schedule() or idle_balance()
				6162	* point, either of which can * drop the rq lock.
				6163	*
				6164	* Also, during early boot the idle thread is in the fair class,
				6165	* for obvious reasons its a bad idea to schedule back to it.
				6166	*/
				6167	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				6168	return;
				6169
				6170	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				6171	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6172	}
				6173
Peter Zijlstra	606dba2	2012-02-11 06:05:00 +0100	[diff] [blame]	6174	static struct task_struct *
Peter Zijlstra	e7904a2	2015-08-01 19:25:08 +0200	[diff] [blame]	6175	pick_next_task_fair(struct rq rq, struct task_struct prev, struct pin_cookie cookie)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6176	{
				6177	struct cfs_rq *cfs_rq = &rq->cfs;
				6178	struct sched_entity *se;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6179	struct task_struct *p;
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	6180	int new_tasks;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6181
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	6182	again:
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6183	#ifdef CONFIG_FAIR_GROUP_SCHED
				6184	if (!cfs_rq->nr_running)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6185	goto idle;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6186
Peter Zijlstra	3f1d2a3	2014-02-12 10:49:30 +0100	[diff] [blame]	6187	if (prev->sched_class != &fair_sched_class)
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6188	goto simple;
				6189
				6190	/*
				6191	* Because of the set_next_buddy() in dequeue_task_fair() it is rather
				6192	* likely that a next task is from the same cgroup as the current.
				6193	*
				6194	* Therefore attempt to avoid putting and setting the entire cgroup
				6195	* hierarchy, only change the part that actually changes.
				6196	*/
				6197
				6198	do {
				6199	struct sched_entity *curr = cfs_rq->curr;
				6200
				6201	/*
				6202	* Since we got here without doing put_prev_entity() we also
				6203	* have to consider cfs_rq->curr. If it is still a runnable
				6204	* entity, update_curr() will update its vruntime, otherwise
				6205	* forget we've ever seen it.
				6206	*/
Ben Segall	54d2736	2015-04-06 15:28:10 -0700	[diff] [blame]	6207	if (curr) {
				6208	if (curr->on_rq)
				6209	update_curr(cfs_rq);
				6210	else
				6211	curr = NULL;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6212
Ben Segall	54d2736	2015-04-06 15:28:10 -0700	[diff] [blame]	6213	/*
				6214	* This call to check_cfs_rq_runtime() will do the
				6215	* throttle and dequeue its entity in the parent(s).
				6216	* Therefore the 'simple' nr_running test will indeed
				6217	* be correct.
				6218	*/
				6219	if (unlikely(check_cfs_rq_runtime(cfs_rq)))
				6220	goto simple;
				6221	}
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6222
				6223	se = pick_next_entity(cfs_rq, curr);
				6224	cfs_rq = group_cfs_rq(se);
				6225	} while (cfs_rq);
				6226
				6227	p = task_of(se);
				6228
				6229	/*
				6230	* Since we haven't yet done put_prev_entity and if the selected task
				6231	* is a different task than we started out with, try and touch the
				6232	* least amount of cfs_rqs.
				6233	*/
				6234	if (prev != p) {
				6235	struct sched_entity *pse = &prev->se;
				6236
				6237	while (!(cfs_rq = is_same_group(se, pse))) {
				6238	int se_depth = se->depth;
				6239	int pse_depth = pse->depth;
				6240
				6241	if (se_depth <= pse_depth) {
				6242	put_prev_entity(cfs_rq_of(pse), pse);
				6243	pse = parent_entity(pse);
				6244	}
				6245	if (se_depth >= pse_depth) {
				6246	set_next_entity(cfs_rq_of(se), se);
				6247	se = parent_entity(se);
				6248	}
				6249	}
				6250
				6251	put_prev_entity(cfs_rq, pse);
				6252	set_next_entity(cfs_rq, se);
				6253	}
				6254
				6255	if (hrtick_enabled(rq))
				6256	hrtick_start_fair(rq, p);
				6257
				6258	return p;
				6259	simple:
				6260	cfs_rq = &rq->cfs;
				6261	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6262
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	6263	if (!cfs_rq->nr_running)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6264	goto idle;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6265
Peter Zijlstra	3f1d2a3	2014-02-12 10:49:30 +0100	[diff] [blame]	6266	put_prev_task(rq, prev);
Peter Zijlstra	606dba2	2012-02-11 06:05:00 +0100	[diff] [blame]	6267
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6268	do {
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6269	se = pick_next_entity(cfs_rq, NULL);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	6270	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6271	cfs_rq = group_cfs_rq(se);
				6272	} while (cfs_rq);
				6273
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6274	p = task_of(se);
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	6275
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	6276	if (hrtick_enabled(rq))
				6277	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6278
				6279	return p;
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6280
				6281	idle:
Peter Zijlstra	cbce1a6	2015-06-11 14:46:54 +0200	[diff] [blame]	6282	/*
				6283	* This is OK, because current is on_cpu, which avoids it being picked
				6284	* for load-balance and preemption/IRQs are still disabled avoiding
				6285	* further scheduler activity on it and we're being very careful to
				6286	* re-start the picking loop.
				6287	*/
Peter Zijlstra	e7904a2	2015-08-01 19:25:08 +0200	[diff] [blame]	6288	lockdep_unpin_lock(&rq->lock, cookie);
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	6289	new_tasks = idle_balance(rq);
Peter Zijlstra	e7904a2	2015-08-01 19:25:08 +0200	[diff] [blame]	6290	lockdep_repin_lock(&rq->lock, cookie);
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	6291	/*
				6292	* Because idle_balance() releases (and re-acquires) rq->lock, it is
				6293	* possible for any higher priority task to appear. In that case we
				6294	* must re-start the pick_next_entity() loop.
				6295	*/
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	6296	if (new_tasks < 0)
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	6297	return RETRY_TASK;
				6298
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	6299	if (new_tasks > 0)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6300	goto again;
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	6301
				6302	return NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6303	}
				6304
				6305	/*
				6306	* Account for a descheduled task:
				6307	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	6308	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6309	{
				6310	struct sched_entity *se = &prev->se;
				6311	struct cfs_rq *cfs_rq;
				6312
				6313	for_each_sched_entity(se) {
				6314	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	6315	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6316	}
				6317	}
				6318
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	6319	/*
				6320	* sched_yield() is very simple
				6321	*
				6322	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				6323	*/
				6324	static void yield_task_fair(struct rq *rq)
				6325	{
				6326	struct task_struct *curr = rq->curr;
				6327	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				6328	struct sched_entity *se = &curr->se;
				6329
				6330	/*
				6331	* Are we the only task in the tree?
				6332	*/
				6333	if (unlikely(rq->nr_running == 1))
				6334	return;
				6335
				6336	clear_buddies(cfs_rq, se);
				6337
				6338	if (curr->policy != SCHED_BATCH) {
				6339	update_rq_clock(rq);
				6340	/*
				6341	* Update run-time statistics of the 'current'.
				6342	*/
				6343	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	6344	/*
				6345	* Tell update_rq_clock() that we've just updated,
				6346	* so we don't do microscopic update in schedule()
				6347	* and double the fastpath cost.
				6348	*/
Peter Zijlstra	9edfbfe	2015-01-05 11:18:11 +0100	[diff] [blame]	6349	rq_clock_skip_update(rq, true);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	6350	}
				6351
				6352	set_skip_buddy(se);
				6353	}
				6354
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6355	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				6356	{
				6357	struct sched_entity *se = &p->se;
				6358
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	6359	/* throttled hierarchies are not runnable */
				6360	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6361	return false;
				6362
				6363	/* Tell the scheduler that we'd really like pse to run next. */
				6364	set_next_buddy(se);
				6365
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6366	yield_task_fair(rq);
				6367
				6368	return true;
				6369	}
				6370
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6371	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6372	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6373	* Fair scheduling class load-balancing methods.
				6374	*
				6375	* BASICS
				6376	*
				6377	* The purpose of load-balancing is to achieve the same basic fairness the
				6378	* per-cpu scheduler provides, namely provide a proportional amount of compute
				6379	* time to each task. This is expressed in the following equation:
				6380	*
				6381	* W_i,n/C_i == W_j,n/C_j for all i,j (1)
				6382	*
				6383	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				6384	* W_i,0 is defined as:
				6385	*
				6386	* W_i,0 = \Sum_j w_i,j (2)
				6387	*
				6388	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
Yuyang Du	1c3de5e	2016-03-30 07:07:51 +0800	[diff] [blame]	6389	* is derived from the nice value as per sched_prio_to_weight[].
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6390	*
				6391	* The weight average is an exponential decay average of the instantaneous
				6392	* weight:
				6393	*
				6394	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				6395	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6396	* C_i is the compute capacity of cpu i, typically it is the
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6397	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				6398	* can also include other factors [XXX].
				6399	*
				6400	* To achieve this balance we define a measure of imbalance which follows
				6401	* directly from (1):
				6402	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6403	* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6404	*
				6405	* We them move tasks around to minimize the imbalance. In the continuous
				6406	* function space it is obvious this converges, in the discrete case we get
				6407	* a few fun cases generally called infeasible weight scenarios.
				6408	*
				6409	* [XXX expand on:
				6410	* - infeasible weights;
				6411	* - local vs global optima in the discrete case. ]
				6412	*
				6413	*
				6414	* SCHED DOMAINS
				6415	*
				6416	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				6417	* for all i,j solution, we create a tree of cpus that follows the hardware
				6418	* topology where each level pairs two lower groups (or better). This results
				6419	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				6420	* tree to only the first of the previous level and we decrease the frequency
				6421	* of load-balance at each level inv. proportional to the number of cpus in
				6422	* the groups.
				6423	*
				6424	* This yields:
				6425	*
				6426	*   log_2 n      1     n
				6427	*   \Sum      { --- * --- * 2^i } = O(n)                          (5)
				6428	*   i = 0       2^i   2^i
				6429	*                            `- size of each group
				6430	*     |          |     `- number of cpus doing load-balance
				6431	*     |          `- freq
				6432	*     `- sum over all levels
				6433	*
				6434	* Coupled with a limit on how many tasks we can migrate every balance pass,
				6435	* this makes (5) the runtime complexity of the balancer.
				6436	*
				6437	* An important property here is that each CPU is still (indirectly) connected
				6438	* to every other cpu in at most O(log n) steps:
				6439	*
				6440	* The adjacency matrix of the resulting graph is given by:
				6441	*
Byungchul Park	97a7142	2015-07-05 18:33:48 +0900	[diff] [blame]	6442	*           log_2 n
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	6443	*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
				6444	*           k = 0
				6445	*
				6446	* And you'll find that:
				6447	*
				6448	* A^(log_2 n)_i,j != 0 for all i,j (7)
				6449	*
				6450	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				6451	* The task movement gives a factor of O(m), giving a convergence complexity
				6452	* of:
				6453	*
				6454	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				6455	*
				6456	*
				6457	* WORK CONSERVING
				6458	*
				6459	* In order to avoid CPUs going idle while there's still work to do, new idle
				6460	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				6461	* tree itself instead of relying on other CPUs to bring it work.
				6462	*
				6463	* This adds some complexity to both (5) and (8) but it reduces the total idle
				6464	* time.
				6465	*
				6466	* [XXX more?]
				6467	*
				6468	*
				6469	* CGROUPS
				6470	*
				6471	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				6472	*
				6473	*                                s_k,i
				6474	*   W_i,0 = \Sum_j \Prod_k w_k * -----                    (9)
				6475	*                                 S_k
				6476	*
				6477	* Where
				6478	*
				6479	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				6480	*
				6481	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				6482	*
				6483	* The big problem is S_k, its a global sum needed to compute a local (W_i)
				6484	* property.
				6485	*
				6486	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				6487	* rewrite all of this once again.]
Byungchul Park	97a7142	2015-07-05 18:33:48 +0900	[diff] [blame]	6488	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6489
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	6490	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				6491
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6492	enum fbq_type { regular, remote, all };
				6493
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6494	#define LBF_ALL_PINNED 0x01
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6495	#define LBF_NEED_BREAK 0x02
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6496	#define LBF_DST_PINNED 0x04
				6497	#define LBF_SOME_PINNED 0x08
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6498
				6499	struct lb_env {
				6500	struct sched_domain *sd;
				6501
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6502	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	6503	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6504
				6505	int dst_cpu;
				6506	struct rq *dst_rq;
				6507
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6508	struct cpumask *dst_grpmask;
				6509	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6510	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6511	long imbalance;
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	6512	/* The set of CPUs under consideration for load-balancing */
				6513	struct cpumask *cpus;
				6514
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6515	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6516
				6517	unsigned int loop;
				6518	unsigned int loop_break;
				6519	unsigned int loop_max;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6520
				6521	enum fbq_type fbq_type;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6522	struct list_head tasks;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6523	};
				6524
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6525	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6526	* Is this task likely cache-hot:
				6527	*/
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	6528	static int task_hot(struct task_struct p, struct lb_env env)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6529	{
				6530	s64 delta;
				6531
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6532	lockdep_assert_held(&env->src_rq->lock);
				6533
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6534	if (p->sched_class != &fair_sched_class)
				6535	return 0;
				6536
				6537	if (unlikely(p->policy == SCHED_IDLE))
				6538	return 0;
				6539
				6540	/*
				6541	* Buddy candidates are cache hot:
				6542	*/
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	6543	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6544	(&p->se == cfs_rq_of(&p->se)->next \|\|
				6545	&p->se == cfs_rq_of(&p->se)->last))
				6546	return 1;
				6547
				6548	if (sysctl_sched_migration_cost == -1)
				6549	return 1;
				6550	if (sysctl_sched_migration_cost == 0)
				6551	return 0;
				6552
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	6553	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6554
				6555	return delta < (s64)sysctl_sched_migration_cost;
				6556	}
				6557
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6558	#ifdef CONFIG_NUMA_BALANCING
Rik van Riel	c1ceac6	2015-05-14 22:59:36 -0400	[diff] [blame]	6559	/*
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6560	* Returns 1, if task migration degrades locality
				6561	* Returns 0, if task migration improves locality i.e migration preferred.
				6562	* Returns -1, if task migration is not affected by locality.
Rik van Riel	c1ceac6	2015-05-14 22:59:36 -0400	[diff] [blame]	6563	*/
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6564	static int migrate_degrades_locality(struct task_struct p, struct lb_env env)
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6565	{
Rik van Riel	b1ad065	2014-05-15 13:03:06 -0400	[diff] [blame]	6566	struct numa_group *numa_group = rcu_dereference(p->numa_group);
Rik van Riel	c1ceac6	2015-05-14 22:59:36 -0400	[diff] [blame]	6567	unsigned long src_faults, dst_faults;
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6568	int src_nid, dst_nid;
				6569
Srikar Dronamraju	2a59572	2015-08-11 21:54:21 +0530	[diff] [blame]	6570	if (!static_branch_likely(&sched_numa_balancing))
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6571	return -1;
				6572
Srikar Dronamraju	c3b9bc5	2015-08-11 16:30:12 +0530	[diff] [blame]	6573	if (!p->numa_faults \|\| !(env->sd->flags & SD_NUMA))
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6574	return -1;
Mel Gorman	7a0f308	2013-10-07 11:29:01 +0100	[diff] [blame]	6575
				6576	src_nid = cpu_to_node(env->src_cpu);
				6577	dst_nid = cpu_to_node(env->dst_cpu);
				6578
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	6579	if (src_nid == dst_nid)
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6580	return -1;
Mel Gorman	7a0f308	2013-10-07 11:29:01 +0100	[diff] [blame]	6581
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6582	/* Migrating away from the preferred node is always bad. */
				6583	if (src_nid == p->numa_preferred_nid) {
				6584	if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
				6585	return 1;
				6586	else
				6587	return -1;
				6588	}
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	6589
Rik van Riel	c1ceac6	2015-05-14 22:59:36 -0400	[diff] [blame]	6590	/* Encourage migration to the preferred node. */
				6591	if (dst_nid == p->numa_preferred_nid)
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6592	return 0;
Rik van Riel	c1ceac6	2015-05-14 22:59:36 -0400	[diff] [blame]	6593
				6594	if (numa_group) {
				6595	src_faults = group_faults(p, src_nid);
				6596	dst_faults = group_faults(p, dst_nid);
				6597	} else {
				6598	src_faults = task_faults(p, src_nid);
				6599	dst_faults = task_faults(p, dst_nid);
				6600	}
				6601
				6602	return dst_faults < src_faults;
Mel Gorman	7a0f308	2013-10-07 11:29:01 +0100	[diff] [blame]	6603	}
				6604
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6605	#else
/*
 * Stub used in the #else configuration: no locality verdict can be given,
 * so always answer -1.  can_migrate_task() treats -1 as "undecided" and
 * falls back to task_hot() in that case.
 */
static inline int
migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
	return -1;
}
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6611	#endif
				6612
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6613	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6614	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				6615	*/
				6616	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6617	int can_migrate_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6618	{
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6619	int tsk_cache_hot;
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6620
				6621	lockdep_assert_held(&env->src_rq->lock);
				6622
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6623	/*
				6624	* We do not migrate tasks that are:
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	6625	* 1) throttled_lb_pair, or
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6626	* 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	6627	* 3) running (obviously), or
				6628	* 4) are cache-hot on their current CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6629	*/
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	6630	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				6631	return 0;
				6632
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6633	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	6634	int cpu;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6635
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6636	schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6637
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6638	env->flags \|= LBF_SOME_PINNED;
				6639
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6640	/*
				6641	* Remember if this task can be migrated to any other cpu in
				6642	* our sched_group. We may want to revisit it if we couldn't
				6643	* meet load balance goals by pulling other tasks on src_cpu.
				6644	*
				6645	* Also avoid computing new_dst_cpu if we have already computed
				6646	* one in current iteration.
				6647	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6648	if (!env->dst_grpmask \|\| (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6649	return 0;
				6650
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	6651	/* Prevent to re-select dst_cpu via env's cpus */
				6652	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
				6653	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6654	env->flags \|= LBF_DST_PINNED;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	6655	env->new_dst_cpu = cpu;
				6656	break;
				6657	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6658	}
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	6659
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6660	return 0;
				6661	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	6662
				6663	/* Record that we found atleast one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6664	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6665
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	6666	if (task_running(env->src_rq, p)) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6667	schedstat_inc(p->se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6668	return 0;
				6669	}
				6670
				6671	/*
				6672	* Aggressive migration if:
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6673	* 1) destination numa is preferred
				6674	* 2) task is cache cold, or
				6675	* 3) too many balance attempts have failed.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6676	*/
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6677	tsk_cache_hot = migrate_degrades_locality(p, env);
				6678	if (tsk_cache_hot == -1)
				6679	tsk_cache_hot = task_hot(p, env);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6680
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6681	if (tsk_cache_hot <= 0 \|\|
Kirill Tkhai	7a96c23	2014-09-22 22:36:12 +0400	[diff] [blame]	6682	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	6683	if (tsk_cache_hot == 1) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6684	schedstat_inc(env->sd->lb_hot_gained[env->idle]);
				6685	schedstat_inc(p->se.statistics.nr_forced_migrations);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	6686	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6687	return 1;
				6688	}
				6689
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6690	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	6691	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6692	}
				6693
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6694	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6695	* detach_task() -- detach the task for the migration specified in env
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6696	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6697	static void detach_task(struct task_struct p, struct lb_env env)
				6698	{
				6699	lockdep_assert_held(&env->src_rq->lock);
				6700
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6701	p->on_rq = TASK_ON_RQ_MIGRATING;
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	6702	deactivate_task(env->src_rq, p, 0);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6703	set_task_cpu(p, env->dst_cpu);
				6704	}
				6705
				6706	/*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6707	* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6708	* part of active balancing operations within "domain".
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6709	*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6710	* Returns a task if successful and NULL otherwise.
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6711	*/
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6712	static struct task_struct detach_one_task(struct lb_env env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6713	{
				6714	struct task_struct p, n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6715
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6716	lockdep_assert_held(&env->src_rq->lock);
				6717
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6718	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6719	if (!can_migrate_task(p, env))
				6720	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6721
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6722	detach_task(p, env);
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6723
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6724	/*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6725	* Right now, this is only the second place where
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6726	* lb_gained[env->idle] is updated (other is detach_tasks)
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6727	* so we can safely collect stats here rather than
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6728	* inside detach_tasks().
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6729	*/
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6730	schedstat_inc(env->sd->lb_gained[env->idle]);
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6731	return p;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6732	}
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	6733	return NULL;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	6734	}
				6735
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	6736	static const unsigned int sched_nr_migrate_break = 32;
				6737
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6738	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6739	* detach_tasks() -- tries to detach up to imbalance weighted load from
				6740	* busiest_rq, as part of a balancing operation within domain "sd".
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6741	*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6742	* Returns number of detached tasks if successful and 0 otherwise.
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6743	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6744	static int detach_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6745	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6746	struct list_head *tasks = &env->src_rq->cfs_tasks;
				6747	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6748	unsigned long load;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6749	int detached = 0;
				6750
				6751	lockdep_assert_held(&env->src_rq->lock);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6752
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6753	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6754	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6755
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6756	while (!list_empty(tasks)) {
Yuyang Du	985d3a4	2015-07-06 06:11:51 +0800	[diff] [blame]	6757	/*
				6758	* We don't want to steal all, otherwise we may be treated likewise,
				6759	* which could at worst lead to a livelock crash.
				6760	*/
				6761	if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
				6762	break;
				6763
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6764	p = list_first_entry(tasks, struct task_struct, se.group_node);
				6765
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6766	env->loop++;
				6767	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6768	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6769	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6770
				6771	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6772	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	6773	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	6774	env->flags \|= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6775	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	6776	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6777
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	6778	if (!can_migrate_task(p, env))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6779	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6780
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6781	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6782
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	6783	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6784	goto next;
				6785
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6786	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6787	goto next;
				6788
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6789	detach_task(p, env);
				6790	list_add(&p->se.group_node, &env->tasks);
				6791
				6792	detached++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6793	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6794
				6795	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6796	/*
				6797	* NEWIDLE balancing is a source of latency, so preemptible
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6798	* kernels will stop after the first task is detached to minimize
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6799	* the critical section.
				6800	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6801	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6802	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6803	#endif
				6804
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6805	/*
				6806	* We only want to steal up to the prescribed amount of
				6807	* weighted load.
				6808	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6809	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	6810	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6811
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6812	continue;
				6813	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6814	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6815	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	6816
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6817	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6818	* Right now, this is one of only two places we collect this stat
				6819	* so we can safely collect detach_one_task() stats here rather
				6820	* than inside detach_one_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6821	*/
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	6822	schedstat_add(env->sd->lb_gained[env->idle], detached);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6823
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6824	return detached;
				6825	}
				6826
				6827	/*
				6828	* attach_task() -- attach the task detached by detach_task() to its new rq.
				6829	*/
				6830	static void attach_task(struct rq rq, struct task_struct p)
				6831	{
				6832	lockdep_assert_held(&rq->lock);
				6833
				6834	BUG_ON(task_rq(p) != rq);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6835	activate_task(rq, p, 0);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	6836	p->on_rq = TASK_ON_RQ_QUEUED;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	6837	check_preempt_curr(rq, p, 0);
				6838	}
				6839
				6840	/*
				6841	* attach_one_task() -- attaches the task returned from detach_one_task() to
				6842	* its new rq.
				6843	*/
				6844	static void attach_one_task(struct rq rq, struct task_struct p)
				6845	{
				6846	raw_spin_lock(&rq->lock);
				6847	attach_task(rq, p);
				6848	raw_spin_unlock(&rq->lock);
				6849	}
				6850
				6851	/*
				6852	* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
				6853	* new rq.
				6854	*/
				6855	static void attach_tasks(struct lb_env *env)
				6856	{
				6857	struct list_head *tasks = &env->tasks;
				6858	struct task_struct *p;
				6859
				6860	raw_spin_lock(&env->dst_rq->lock);
				6861
				6862	while (!list_empty(tasks)) {
				6863	p = list_first_entry(tasks, struct task_struct, se.group_node);
				6864	list_del_init(&p->se.group_node);
				6865
				6866	attach_task(env->dst_rq, p);
				6867	}
				6868
				6869	raw_spin_unlock(&env->dst_rq->lock);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6870	}
				6871
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6872	#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6873	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6874	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6875	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6876	struct cfs_rq *cfs_rq;
				6877	unsigned long flags;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6878
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6879	raw_spin_lock_irqsave(&rq->lock, flags);
				6880	update_rq_clock(rq);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6881
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6882	/*
				6883	* Iterates the task_group tree in a bottom up fashion, see
				6884	* list_add_leaf_cfs_rq() for details.
				6885	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	6886	for_each_leaf_cfs_rq(rq, cfs_rq) {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6887	/* throttled entities do not contribute to load */
				6888	if (throttled_hierarchy(cfs_rq))
				6889	continue;
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6890
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	6891	if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6892	update_tg_load_avg(cfs_rq, 0);
Vincent Guittot	4e51607	2016-11-08 10:53:46 +0100	[diff] [blame]	6893
				6894	/* Propagate pending load changes to the parent */
				6895	if (cfs_rq->tg->se[cpu])
				6896	update_load_avg(cfs_rq->tg->se[cpu], 0);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6897	}
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6898	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6899	}
				6900
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6901	/*
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6902	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6903	* This needs to be done in a top-down fashion because the load of a child
				6904	* group is a fraction of its parents load.
				6905	*/
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6906	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6907	{
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6908	struct rq *rq = rq_of(cfs_rq);
				6909	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	6910	unsigned long now = jiffies;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6911	unsigned long load;
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	6912
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6913	if (cfs_rq->last_h_load_update == now)
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	6914	return;
				6915
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6916	cfs_rq->h_load_next = NULL;
				6917	for_each_sched_entity(se) {
				6918	cfs_rq = cfs_rq_of(se);
				6919	cfs_rq->h_load_next = se;
				6920	if (cfs_rq->last_h_load_update == now)
				6921	break;
				6922	}
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	6923
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6924	if (!se) {
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	6925	cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6926	cfs_rq->last_h_load_update = now;
				6927	}
				6928
				6929	while ((se = cfs_rq->h_load_next) != NULL) {
				6930	load = cfs_rq->h_load;
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	6931	load = div64_ul(load * se->avg.load_avg,
				6932	cfs_rq_load_avg(cfs_rq) + 1);
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6933	cfs_rq = group_cfs_rq(se);
				6934	cfs_rq->h_load = load;
				6935	cfs_rq->last_h_load_update = now;
				6936	}
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6937	}
				6938
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6939	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6940	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6941	struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6942
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6943	update_cfs_rq_h_load(cfs_rq);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6944	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	6945	cfs_rq_load_avg(cfs_rq) + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6946	}
				6947	#else
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6948	static inline void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6949	{
Vincent Guittot	6c1d47c	2015-07-15 08:04:38 +0800	[diff] [blame]	6950	struct rq *rq = cpu_rq(cpu);
				6951	struct cfs_rq *cfs_rq = &rq->cfs;
				6952	unsigned long flags;
				6953
				6954	raw_spin_lock_irqsave(&rq->lock, flags);
				6955	update_rq_clock(rq);
Steve Muckle	a2c6c91	2016-03-24 15:26:07 -0700	[diff] [blame]	6956	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
Vincent Guittot	6c1d47c	2015-07-15 08:04:38 +0800	[diff] [blame]	6957	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6958	}
				6959
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6960	static unsigned long task_h_load(struct task_struct *p)
				6961	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6962	return p->se.avg.load_avg;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6963	}
				6964	#endif
				6965
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6966	/******** Helpers for find_busiest_group **********************/
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6967
				6968	enum group_type {
				6969	group_other = 0,
				6970	group_imbalanced,
				6971	group_overloaded,
				6972	};
				6973
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6974	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6975	* sg_lb_stats - stats of a sched_group required for load_balancing
				6976	*/
				6977	struct sg_lb_stats {
				6978	unsigned long avg_load; /Avg load across the CPUs of the group /
				6979	unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6980	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6981	unsigned long load_per_task;
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6982	unsigned long group_capacity;
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	6983	unsigned long group_util; /* Total utilization of the group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6984	unsigned int sum_nr_running; /* Nr tasks running in the group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6985	unsigned int idle_cpus;
				6986	unsigned int group_weight;
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6987	enum group_type group_type;
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6988	int group_no_capacity;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6989	#ifdef CONFIG_NUMA_BALANCING
				6990	unsigned int nr_numa_running;
				6991	unsigned int nr_preferred_running;
				6992	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6993	};
				6994
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6995	/*
				6996	* sd_lb_stats - Structure to store the statistics of a sched_domain
				6997	* during load balancing.
				6998	*/
				6999	struct sd_lb_stats {
				7000	struct sched_group busiest; / Busiest group in this sd */
				7001	struct sched_group local; / Local group in this sd */
				7002	unsigned long total_load; /* Total load of all groups in sd */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7003	unsigned long total_capacity; /* Total capacity of all groups in sd */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7004	unsigned long avg_load; /* Average load across all groups in sd */
				7005
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7006	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7007	struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7008	};
				7009
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7010	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
				7011	{
				7012	/*
				7013	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
				7014	* local_stat because update_sg_lb_stats() does a full clear/assignment.
				7015	* We must however clear busiest_stat::avg_load because
				7016	* update_sd_pick_busiest() reads this before assignment.
				7017	*/
				7018	*sds = (struct sd_lb_stats){
				7019	.busiest = NULL,
				7020	.local = NULL,
				7021	.total_load = 0UL,
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7022	.total_capacity = 0UL,
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7023	.busiest_stat = {
				7024	.avg_load = 0UL,
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7025	.sum_nr_running = 0,
				7026	.group_type = group_other,
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7027	},
				7028	};
				7029	}
				7030
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7031	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7032	* get_sd_load_idx - Obtain the load index for a given sched domain.
				7033	* @sd: The sched_domain whose load_idx is to be obtained.
Kamalesh Babulal	ed1b773	2013-10-13 23:06:15 +0530	[diff] [blame]	7034	* @idle: The idle status of the CPU for whose sd load_idx is obtained.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	7035	*
				7036	* Return: The load index.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7037	*/
				7038	static inline int get_sd_load_idx(struct sched_domain *sd,
				7039	enum cpu_idle_type idle)
				7040	{
				7041	int load_idx;
				7042
				7043	switch (idle) {
				7044	case CPU_NOT_IDLE:
				7045	load_idx = sd->busy_idx;
				7046	break;
				7047
				7048	case CPU_NEWLY_IDLE:
				7049	load_idx = sd->newidle_idx;
				7050	break;
				7051	default:
				7052	load_idx = sd->idle_idx;
				7053	break;
				7054	}
				7055
				7056	return load_idx;
				7057	}
				7058
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7059	static unsigned long scale_rt_capacity(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7060	{
				7061	struct rq *rq = cpu_rq(cpu);
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	7062	u64 total, used, age_stamp, avg;
Peter Zijlstra	cadefd3	2014-02-27 10:40:35 +0100	[diff] [blame]	7063	s64 delta;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7064
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	7065	/*
				7066	* Since we're reading these variables without serialization make sure
				7067	* we read them once before doing sanity checks on them.
				7068	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	7069	age_stamp = READ_ONCE(rq->age_stamp);
				7070	avg = READ_ONCE(rq->rt_avg);
Peter Zijlstra	cebde6d	2015-01-05 11:18:10 +0100	[diff] [blame]	7071	delta = __rq_clock_broken(rq) - age_stamp;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	7072
Peter Zijlstra	cadefd3	2014-02-27 10:40:35 +0100	[diff] [blame]	7073	if (unlikely(delta < 0))
				7074	delta = 0;
				7075
				7076	total = sched_avg_period() + delta;
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	7077
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	7078	used = div_u64(avg, total);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7079
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	7080	if (likely(used < SCHED_CAPACITY_SCALE))
				7081	return SCHED_CAPACITY_SCALE - used;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7082
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	7083	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7084	}
				7085
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7086	static void update_cpu_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7087	{
Morten Rasmussen	8cd5601	2015-08-14 17:23:10 +0100	[diff] [blame]	7088	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7089	struct sched_group *sdg = sd->groups;
				7090
Vincent Guittot	ca6d75e	2015-02-27 16:54:09 +0100	[diff] [blame]	7091	cpu_rq(cpu)->cpu_capacity_orig = capacity;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7092
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7093	capacity *= scale_rt_capacity(cpu);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7094	capacity >>= SCHED_CAPACITY_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7095
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7096	if (!capacity)
				7097	capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7098
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7099	cpu_rq(cpu)->cpu_capacity = capacity;
				7100	sdg->sgc->capacity = capacity;
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7101	sdg->sgc->min_capacity = capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7102	}
				7103
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7104	void update_group_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7105	{
				7106	struct sched_domain *child = sd->child;
				7107	struct sched_group group, sdg = sd->groups;
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7108	unsigned long capacity, min_capacity;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	7109	unsigned long interval;
				7110
				7111	interval = msecs_to_jiffies(sd->balance_interval);
				7112	interval = clamp(interval, 1UL, max_load_balance_interval);
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7113	sdg->sgc->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7114
				7115	if (!child) {
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7116	update_cpu_capacity(sd, cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7117	return;
				7118	}
				7119
Vincent Guittot	dc7ff76	2015-03-03 11:35:03 +0100	[diff] [blame]	7120	capacity = 0;
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7121	min_capacity = ULONG_MAX;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7122
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	7123	if (child->flags & SD_OVERLAP) {
				7124	/*
				7125	* SD_OVERLAP domains cannot assume that child groups
				7126	* span the current group.
				7127	*/
				7128
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	7129	for_each_cpu(cpu, sched_group_cpus(sdg)) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7130	struct sched_group_capacity *sgc;
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7131	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	7132
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7133	/*
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7134	* build_sched_domains() -> init_sched_groups_capacity()
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7135	* gets here before we've attached the domains to the
				7136	* runqueues.
				7137	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7138	* Use capacity_of(), which is set irrespective of domains
				7139	* in update_cpu_capacity().
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7140	*
Vincent Guittot	dc7ff76	2015-03-03 11:35:03 +0100	[diff] [blame]	7141	* This avoids capacity from being 0 and
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7142	* causing divide-by-zero issues on boot.
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7143	*/
				7144	if (unlikely(!rq->sd)) {
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7145	capacity += capacity_of(cpu);
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7146	} else {
				7147	sgc = rq->sd->groups->sgc;
				7148	capacity += sgc->capacity;
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	7149	}
				7150
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7151	min_capacity = min(capacity, min_capacity);
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	7152	}
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	7153	} else {
				7154	/*
				7155	* !SD_OVERLAP domains can assume that child groups
				7156	* span the current group.
Byungchul Park	97a7142	2015-07-05 18:33:48 +0900	[diff] [blame]	7157	*/
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	7158
				7159	group = child->groups;
				7160	do {
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7161	struct sched_group_capacity *sgc = group->sgc;
				7162
				7163	capacity += sgc->capacity;
				7164	min_capacity = min(sgc->min_capacity, min_capacity);
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	7165	group = group->next;
				7166	} while (group != child->groups);
				7167	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7168
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7169	sdg->sgc->capacity = capacity;
Morten Rasmussen	bf475ce	2016-10-14 14:41:09 +0100	[diff] [blame]	7170	sdg->sgc->min_capacity = min_capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7171	}
				7172
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7173	/*
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7174	* Check whether the capacity of the rq has been noticeably reduced by side
				7175	* activity. The imbalance_pct is used for the threshold.
				7176	* Return true is the capacity is reduced
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7177	*/
				7178	static inline int
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7179	check_cpu_capacity(struct rq rq, struct sched_domain sd)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7180	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7181	return ((rq->cpu_capacity * sd->imbalance_pct) <
				7182	(rq->cpu_capacity_orig * 100));
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7183	}
				7184
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7185	/*
				7186	* Group imbalance indicates (and tries to solve) the problem where balancing
				7187	* groups is inadequate due to tsk_cpus_allowed() constraints.
				7188	*
				7189	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
				7190	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
				7191	* Something like:
				7192	*
Ingo Molnar	2b4d5b2	2016-11-23 07:37:00 +0100	[diff] [blame]	7193	* { 0 1 2 3 } { 4 5 6 7 }
				7194	* * * * *
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7195	*
				7196	* If we were to balance group-wise we'd place two tasks in the first group and
				7197	* two tasks in the second group. Clearly this is undesired as it will overload
				7198	* cpu 3 and leave one of the cpus in the second group unused.
				7199	*
				7200	* The current solution to this issue is detecting the skew in the first group
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7201	* by noticing the lower domain failed to reach balance and had difficulty
				7202	* moving tasks due to affinity constraints.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7203	*
				7204	* When this is so detected; this group becomes a candidate for busiest; see
Kamalesh Babulal	ed1b773	2013-10-13 23:06:15 +0530	[diff] [blame]	7205	* update_sd_pick_busiest(). And calculate_imbalance() and
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7206	* find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7207	* to create an effective group imbalance.
				7208	*
				7209	* This is a somewhat tricky proposition since the next run might not find the
				7210	* group imbalance and decide the groups need to be balanced again. A most
				7211	* subtle and fragile situation.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7212	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7213
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7214	static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7215	{
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7216	return group->sgc->imbalance;
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7217	}
				7218
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7219	/*
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7220	* group_has_capacity returns true if the group has spare capacity that could
				7221	* be used by some tasks.
				7222	* We consider that a group has spare capacity if the * number of task is
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7223	* smaller than the number of CPUs or if the utilization is lower than the
				7224	* available capacity for CFS tasks.
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7225	* For the latter, we use a threshold to stabilize the state, to take into
				7226	* account the variance of the tasks' load and to return true if the available
				7227	* capacity in meaningful for the load balancer.
				7228	* As an example, an available capacity of 1% can appear but it doesn't make
				7229	* any benefit for the load balance.
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7230	*/
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7231	static inline bool
				7232	group_has_capacity(struct lb_env env, struct sg_lb_stats sgs)
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7233	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7234	if (sgs->sum_nr_running < sgs->group_weight)
				7235	return true;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7236
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7237	if ((sgs->group_capacity * 100) >
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7238	(sgs->group_util * env->sd->imbalance_pct))
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7239	return true;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7240
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7241	return false;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7242	}
				7243
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7244	/*
				7245	* group_is_overloaded returns true if the group has more tasks than it can
				7246	* handle.
				7247	* group_is_overloaded is not equals to !group_has_capacity because a group
				7248	* with the exact right number of tasks, has no more spare capacity but is not
				7249	* overloaded so both group_has_capacity and group_is_overloaded return
				7250	* false.
				7251	*/
				7252	static inline bool
				7253	group_is_overloaded(struct lb_env env, struct sg_lb_stats sgs)
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7254	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7255	if (sgs->sum_nr_running <= sgs->group_weight)
				7256	return false;
				7257
				7258	if ((sgs->group_capacity * 100) <
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7259	(sgs->group_util * env->sd->imbalance_pct))
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7260	return true;
				7261
				7262	return false;
				7263	}
				7264
Morten Rasmussen	9e0994c	2016-10-14 14:41:10 +0100	[diff] [blame]	7265	/*
				7266	* group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
				7267	* per-CPU capacity than sched_group ref.
				7268	*/
				7269	static inline bool
				7270	group_smaller_cpu_capacity(struct sched_group sg, struct sched_group ref)
				7271	{
				7272	return sg->sgc->min_capacity * capacity_margin <
				7273	ref->sgc->min_capacity * 1024;
				7274	}
				7275
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	7276	static inline enum
				7277	group_type group_classify(struct sched_group *group,
				7278	struct sg_lb_stats *sgs)
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7279	{
				7280	if (sgs->group_no_capacity)
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7281	return group_overloaded;
				7282
				7283	if (sg_imbalanced(group))
				7284	return group_imbalanced;
				7285
				7286	return group_other;
				7287	}
				7288
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7289	/**
				7290	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
				7291	* @env: The load balancing environment.
				7292	* @group: sched_group whose statistics are to be updated.
				7293	* @load_idx: Load index of sched_domain of this_cpu for load calc.
				7294	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7295	* @sgs: variable to hold the statistics for this group.
Masanari Iida	cd3bd4e	2014-07-28 12:38:06 +0900	[diff] [blame]	7296	* @overload: Indicate more than one runnable task for any CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7297	*/
				7298	static inline void update_sg_lb_stats(struct lb_env *env,
				7299	struct sched_group *group, int load_idx,
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7300	int local_group, struct sg_lb_stats *sgs,
				7301	bool *overload)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7302	{
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7303	unsigned long load;
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	7304	int i, nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7305
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7306	memset(sgs, 0, sizeof(*sgs));
				7307
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	7308	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7309	struct rq *rq = cpu_rq(i);
				7310
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7311	/* Bias balancing toward cpus of our domain */
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7312	if (local_group)
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	7313	load = target_load(i, load_idx);
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7314	else
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7315	load = source_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7316
				7317	sgs->group_load += load;
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	7318	sgs->group_util += cpu_util(i);
Vincent Guittot	65fdac0	2014-08-26 13:06:46 +0200	[diff] [blame]	7319	sgs->sum_nr_running += rq->cfs.h_nr_running;
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7320
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	7321	nr_running = rq->nr_running;
				7322	if (nr_running > 1)
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7323	*overload = true;
				7324
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7325	#ifdef CONFIG_NUMA_BALANCING
				7326	sgs->nr_numa_running += rq->nr_numa_running;
				7327	sgs->nr_preferred_running += rq->nr_preferred_running;
				7328	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7329	sgs->sum_weighted_load += weighted_cpuload(i);
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	7330	/*
				7331	* No need to call idle_cpu() if nr_running is not 0
				7332	*/
				7333	if (!nr_running && idle_cpu(i))
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7334	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7335	}
				7336
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7337	/* Adjust by relative CPU capacity of the group */
				7338	sgs->group_capacity = group->sgc->capacity;
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7339	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7340
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7341	if (sgs->sum_nr_running)
Peter Zijlstra	38d0f77	2013-08-15 19:47:56 +0200	[diff] [blame]	7342	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7343
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7344	sgs->group_weight = group->group_weight;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	7345
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7346	sgs->group_no_capacity = group_is_overloaded(env, sgs);
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	7347	sgs->group_type = group_classify(group, sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7348	}
				7349
				7350	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7351	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7352	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7353	* @sds: sched_domain statistics
				7354	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	7355	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7356	*
				7357	* Determine if @sg is a busier group than the previously selected
				7358	* busiest group.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	7359	*
				7360	* Return: %true if @sg is a busier group than the previously selected
				7361	* busiest group. %false otherwise.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7362	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7363	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7364	struct sd_lb_stats *sds,
				7365	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7366	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7367	{
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7368	struct sg_lb_stats *busiest = &sds->busiest_stat;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7369
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7370	if (sgs->group_type > busiest->group_type)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7371	return true;
				7372
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7373	if (sgs->group_type < busiest->group_type)
				7374	return false;
				7375
				7376	if (sgs->avg_load <= busiest->avg_load)
				7377	return false;
				7378
Morten Rasmussen	9e0994c	2016-10-14 14:41:10 +0100	[diff] [blame]	7379	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
				7380	goto asym_packing;
				7381
				7382	/*
				7383	* Candidate sg has no more than one task per CPU and
				7384	* has higher per-CPU capacity. Migrating tasks to less
				7385	* capable CPUs may harm throughput. Maximize throughput,
				7386	* power/energy consequences are not considered.
				7387	*/
				7388	if (sgs->sum_nr_running <= sgs->group_weight &&
				7389	group_smaller_cpu_capacity(sds->local, sg))
				7390	return false;
				7391
				7392	asym_packing:
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7393	/* This is the busiest node in its class. */
				7394	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7395	return true;
				7396
Srikar Dronamraju	1f621e0	2016-04-06 18:47:40 +0530	[diff] [blame]	7397	/* No ASYM_PACKING if target cpu is already busy */
				7398	if (env->idle == CPU_NOT_IDLE)
				7399	return true;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7400	/*
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	7401	* ASYM_PACKING needs to move all the work to the highest
				7402	* prority CPUs in the group, therefore mark all groups
				7403	* of lower priority than ourself as busy.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7404	*/
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	7405	if (sgs->sum_nr_running &&
				7406	sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7407	if (!sds->busiest)
				7408	return true;
				7409
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	7410	/* Prefer to move from lowest priority cpu's work */
				7411	if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
				7412	sg->asym_prefer_cpu))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7413	return true;
				7414	}
				7415
				7416	return false;
				7417	}
				7418
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7419	#ifdef CONFIG_NUMA_BALANCING
				7420	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				7421	{
				7422	if (sgs->sum_nr_running > sgs->nr_numa_running)
				7423	return regular;
				7424	if (sgs->sum_nr_running > sgs->nr_preferred_running)
				7425	return remote;
				7426	return all;
				7427	}
				7428
				7429	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				7430	{
				7431	if (rq->nr_running > rq->nr_numa_running)
				7432	return regular;
				7433	if (rq->nr_running > rq->nr_preferred_running)
				7434	return remote;
				7435	return all;
				7436	}
				7437	#else
				7438	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				7439	{
				7440	return all;
				7441	}
				7442
				7443	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				7444	{
				7445	return regular;
				7446	}
				7447	#endif /* CONFIG_NUMA_BALANCING */
				7448
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7449	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	7450	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7451	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7452	* @sds: variable to hold the statistics for this sched_domain.
				7453	*/
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7454	static inline void update_sd_lb_stats(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7455	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7456	struct sched_domain *child = env->sd->child;
				7457	struct sched_group *sg = env->sd->groups;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7458	struct sg_lb_stats tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7459	int load_idx, prefer_sibling = 0;
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7460	bool overload = false;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7461
				7462	if (child && child->flags & SD_PREFER_SIBLING)
				7463	prefer_sibling = 1;
				7464
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7465	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7466
				7467	do {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7468	struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7469	int local_group;
				7470
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7471	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7472	if (local_group) {
				7473	sds->local = sg;
				7474	sgs = &sds->local_stat;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7475
				7476	if (env->idle != CPU_NEWLY_IDLE \|\|
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7477	time_after_eq(jiffies, sg->sgc->next_update))
				7478	update_group_capacity(env->sd, env->dst_cpu);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7479	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7480
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7481	update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
				7482	&overload);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7483
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7484	if (local_group)
				7485	goto next_group;
				7486
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7487	/*
				7488	* In case the child domain prefers tasks go to siblings
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7489	* first, lower the sg capacity so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	7490	* and move all the excess tasks away. We lower the capacity
				7491	* of a group only if the local group has the capacity to fit
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7492	* these excess tasks. The extra check prevents the case where
				7493	* you always pull from the heaviest group when it is already
				7494	* under-utilized (possible with a large weight task outweighs
				7495	* the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7496	*/
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7497	if (prefer_sibling && sds->local &&
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7498	group_has_capacity(env, &sds->local_stat) &&
				7499	(sgs->sum_nr_running > 1)) {
				7500	sgs->group_no_capacity = 1;
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	7501	sgs->group_type = group_classify(sg, sgs);
Wanpeng Li	cb0b9f2	2014-11-05 07:44:50 +0800	[diff] [blame]	7502	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7503
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7504	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7505	sds->busiest = sg;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7506	sds->busiest_stat = *sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7507	}
				7508
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7509	next_group:
				7510	/* Now, start updating sd_lb_stats */
				7511	sds->total_load += sgs->group_load;
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7512	sds->total_capacity += sgs->group_capacity;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	7513
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7514	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7515	} while (sg != env->sd->groups);
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7516
				7517	if (env->sd->flags & SD_NUMA)
				7518	env->fbq_type = fbq_classify_group(&sds->busiest_stat);
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7519
				7520	if (!env->sd->parent) {
				7521	/* update overload indicator if we are at root domain */
				7522	if (env->dst_rq->rd->overload != overload)
				7523	env->dst_rq->rd->overload = overload;
				7524	}
				7525
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7526	}
				7527
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7528	/**
				7529	* check_asym_packing - Check to see if the group is packed into the
				7530	* sched doman.
				7531	*
				7532	* This is primarily intended to used at the sibling level. Some
				7533	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				7534	* case of POWER7, it can move to lower SMT modes only when higher
				7535	* threads are idle. When in lower SMT modes, the threads will
				7536	* perform better since they share less core resources. Hence when we
				7537	* have idle threads, we want them to be the higher ones.
				7538	*
				7539	* This packing function is run on idle threads. It checks to see if
				7540	* the busiest CPU in this domain (core in the P7 case) has a higher
				7541	* CPU number than the packing function is being run on. Here we are
				7542	* assuming lower CPU number will be equivalent to lower a SMT thread
				7543	* number.
				7544	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	7545	* Return: 1 when packing is required and a task should be moved to
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	7546	* this CPU. The amount of the imbalance is returned in *imbalance.
				7547	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7548	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7549	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7550	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7551	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7552	{
				7553	int busiest_cpu;
				7554
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7555	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7556	return 0;
				7557
Srikar Dronamraju	1f621e0	2016-04-06 18:47:40 +0530	[diff] [blame]	7558	if (env->idle == CPU_NOT_IDLE)
				7559	return 0;
				7560
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7561	if (!sds->busiest)
				7562	return 0;
				7563
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	7564	busiest_cpu = sds->busiest->asym_prefer_cpu;
				7565	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7566	return 0;
				7567
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7568	env->imbalance = DIV_ROUND_CLOSEST(
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7569	sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7570	SCHED_CAPACITY_SCALE);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7571
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7572	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7573	}
				7574
				7575	/**
				7576	* fix_small_imbalance - Calculate the minor imbalance that exists
				7577	* amongst the groups of a sched_domain, during
				7578	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7579	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7580	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7581	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7582	static inline
				7583	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7584	{
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7585	unsigned long tmp, capa_now = 0, capa_move = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7586	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7587	unsigned long scaled_busy_load_per_task;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7588	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7589
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7590	local = &sds->local_stat;
				7591	busiest = &sds->busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7592
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7593	if (!local->sum_nr_running)
				7594	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
				7595	else if (busiest->load_per_task > local->load_per_task)
				7596	imbn = 1;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7597
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7598	scaled_busy_load_per_task =
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7599	(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7600	busiest->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7601
Vladimir Davydov	3029ede	2013-09-15 17:49:14 +0400	[diff] [blame]	7602	if (busiest->avg_load + scaled_busy_load_per_task >=
				7603	local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7604	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7605	return;
				7606	}
				7607
				7608	/*
				7609	* OK, we don't have enough imbalance to justify moving tasks,
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7610	* however we may be able to increase total CPU capacity used by
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7611	* moving them.
				7612	*/
				7613
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7614	capa_now += busiest->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7615	min(busiest->load_per_task, busiest->avg_load);
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7616	capa_now += local->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7617	min(local->load_per_task, local->avg_load);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7618	capa_now /= SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7619
				7620	/* Amount of load we'd subtract */
Vincent Guittot	a2cd426	2014-03-11 17:26:06 +0100	[diff] [blame]	7621	if (busiest->avg_load > scaled_busy_load_per_task) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7622	capa_move += busiest->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7623	min(busiest->load_per_task,
Vincent Guittot	a2cd426	2014-03-11 17:26:06 +0100	[diff] [blame]	7624	busiest->avg_load - scaled_busy_load_per_task);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7625	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7626
				7627	/* Amount of load we'd add */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7628	if (busiest->avg_load * busiest->group_capacity <
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7629	busiest->load_per_task * SCHED_CAPACITY_SCALE) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7630	tmp = (busiest->avg_load * busiest->group_capacity) /
				7631	local->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7632	} else {
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7633	tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7634	local->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7635	}
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7636	capa_move += local->group_capacity *
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	7637	min(local->load_per_task, local->avg_load + tmp);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7638	capa_move /= SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7639
				7640	/* Move if we gain throughput */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7641	if (capa_move > capa_now)
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7642	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7643	}
				7644
				7645	/**
				7646	* calculate_imbalance - Calculate the amount of imbalance present within the
				7647	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7648	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7649	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7650	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7651	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7652	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7653	unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7654	struct sg_lb_stats local, busiest;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7655
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7656	local = &sds->local_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7657	busiest = &sds->busiest_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7658
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7659	if (busiest->group_type == group_imbalanced) {
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7660	/*
				7661	* In the group_imb case we cannot rely on group-wide averages
				7662	* to ensure cpu-load equilibrium, look at wider averages. XXX
				7663	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7664	busiest->load_per_task =
				7665	min(busiest->load_per_task, sds->avg_load);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7666	}
				7667
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7668	/*
Dietmar Eggemann	885e542	2016-04-29 20:32:39 +0100	[diff] [blame]	7669	* Avg load of busiest sg can be less and avg load of local sg can
				7670	* be greater than avg load across all sgs of sd because avg load
				7671	* factors in sg capacity and sgs with smaller group_type are
				7672	* skipped when updating the busiest sg:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7673	*/
Vladimir Davydov	b188555	2013-09-15 17:49:13 +0400	[diff] [blame]	7674	if (busiest->avg_load <= sds->avg_load \|\|
				7675	local->avg_load >= sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7676	env->imbalance = 0;
				7677	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7678	}
				7679
Peter Zijlstra	9a5d9ba	2014-07-29 17:15:11 +0200	[diff] [blame]	7680	/*
				7681	* If there aren't any idle cpus, avoid creating some.
				7682	*/
				7683	if (busiest->group_type == group_overloaded &&
				7684	local->group_type == group_overloaded) {
Peter Zijlstra	1be0eb2	2016-05-06 12:21:23 +0200	[diff] [blame]	7685	load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
Morten Rasmussen	cfa1033	2016-04-29 20:32:40 +0100	[diff] [blame]	7686	if (load_above_capacity > busiest->group_capacity) {
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7687	load_above_capacity -= busiest->group_capacity;
Dietmar Eggemann	2665621	2016-08-10 11:27:27 +0100	[diff] [blame]	7688	load_above_capacity *= scale_load_down(NICE_0_LOAD);
Morten Rasmussen	cfa1033	2016-04-29 20:32:40 +0100	[diff] [blame]	7689	load_above_capacity /= busiest->group_capacity;
				7690	} else
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7691	load_above_capacity = ~0UL;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7692	}
				7693
				7694	/*
				7695	* We're trying to get all the cpus to the average_load, so we don't
				7696	* want to push ourselves above the average load, nor do we wish to
				7697	* reduce the max loaded cpu below the average load. At the same time,
Dietmar Eggemann	0a9b23c	2016-04-29 20:32:38 +0100	[diff] [blame]	7698	* we also don't want to reduce the group load below the group
				7699	* capacity. Thus we look for the minimum possible imbalance.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	7700	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7701	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7702
				7703	/* How much load to actually move to equalise the imbalance */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7704	env->imbalance = min(
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7705	max_pull * busiest->group_capacity,
				7706	(sds->avg_load - local->avg_load) * local->group_capacity
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7707	) / SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7708
				7709	/*
				7710	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	7711	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7712	* a think about bumping its value to force at least one task to be
				7713	* moved
				7714	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7715	if (env->imbalance < busiest->load_per_task)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7716	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7717	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	7718
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7719	/***** find_busiest_group() helpers end here *******************/
				7720
				7721	/**
				7722	* find_busiest_group - Returns the busiest group within the sched_domain
Dietmar Eggemann	0a9b23c	2016-04-29 20:32:38 +0100	[diff] [blame]	7723	* if there is an imbalance.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7724	*
				7725	* Also calculates the amount of weighted load which should be moved
				7726	* to restore balance.
				7727	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	7728	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7729	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	7730	* Return: - The busiest group if imbalance exists.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7731	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7732	static struct sched_group find_busiest_group(struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7733	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7734	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7735	struct sd_lb_stats sds;
				7736
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	7737	init_sd_lb_stats(&sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7738
				7739	/*
				7740	* Compute the various statistics relavent for load balancing at
				7741	* this level.
				7742	*/
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7743	update_sd_lb_stats(env, &sds);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7744	local = &sds.local_stat;
				7745	busiest = &sds.busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7746
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7747	/* ASYM feature bypasses nice load balance check */
Srikar Dronamraju	1f621e0	2016-04-06 18:47:40 +0530	[diff] [blame]	7748	if (check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7749	return sds.busiest;
				7750
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7751	/* There is no busy sibling group to pull tasks from */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7752	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7753	goto out_balanced;
				7754
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	7755	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
				7756	/ sds.total_capacity;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	7757
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	7758	/*
				7759	* If the busiest group is imbalanced the below checks don't
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	7760	* work because they assume all things are equal, which typically
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	7761	* isn't true due to cpus_allowed constraints and the like.
				7762	*/
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	7763	if (busiest->group_type == group_imbalanced)
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	7764	goto force_balance;
				7765
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7766	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7767	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
				7768	busiest->group_no_capacity)
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	7769	goto force_balance;
				7770
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7771	/*
Zhihui Zhang	9c58c79	2014-09-20 21:24:36 -0400	[diff] [blame]	7772	* If the local group is busier than the selected busiest group
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7773	* don't try and pull any tasks.
				7774	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7775	if (local->avg_load >= busiest->avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7776	goto out_balanced;
				7777
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	7778	/*
				7779	* Don't pull any tasks if this group is already above the domain
				7780	* average load.
				7781	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7782	if (local->avg_load >= sds.avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7783	goto out_balanced;
				7784
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7785	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7786	/*
Vincent Guittot	43f4d66	2014-10-01 15:38:55 +0200	[diff] [blame]	7787	* This cpu is idle. If the busiest group is not overloaded
				7788	* and there is no imbalance between this and busiest group
				7789	* wrt idle cpus, it is balanced. The imbalance becomes
				7790	* significant if the diff is greater than 1 otherwise we
				7791	* might end up to just move the imbalance on another group
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7792	*/
Vincent Guittot	43f4d66	2014-10-01 15:38:55 +0200	[diff] [blame]	7793	if ((busiest->group_type != group_overloaded) &&
				7794	(local->idle_cpus <= (busiest->idle_cpus + 1)))
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7795	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	7796	} else {
				7797	/*
				7798	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				7799	* imbalance_pct to be conservative.
				7800	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	7801	if (100 * busiest->avg_load <=
				7802	env->sd->imbalance_pct * local->avg_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	7803	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	7804	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7805
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	7806	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7807	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7808	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7809	return sds.busiest;
				7810
				7811	out_balanced:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7812	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7813	return NULL;
				7814	}
				7815
				7816	/*
				7817	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				7818	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7819	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	7820	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7821	{
				7822	struct rq busiest = NULL, rq;
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7823	unsigned long busiest_load = 0, busiest_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7824	int i;
				7825
Peter Zijlstra	6906a40	2013-08-19 15:20:21 +0200	[diff] [blame]	7826	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7827	unsigned long capacity, wl;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7828	enum fbq_type rt;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7829
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7830	rq = cpu_rq(i);
				7831	rt = fbq_classify_rq(rq);
				7832
				7833	/*
				7834	* We classify groups/runqueues into three groups:
				7835	* - regular: there are !numa tasks
				7836	* - remote: there are numa tasks that run on the 'wrong' node
				7837	* - all: there is no distinction
				7838	*
				7839	* In order to avoid migrating ideally placed numa tasks,
				7840	* ignore those when there's better options.
				7841	*
				7842	* If we ignore the actual busiest queue to migrate another
				7843	* task, the next balance pass can still reduce the busiest
				7844	* queue by moving tasks around inside the node.
				7845	*
				7846	* If we cannot move enough load due to this classification
				7847	* the next pass will adjust the group classification and
				7848	* allow migration of more tasks.
				7849	*
				7850	* Both cases only affect the total convergence complexity.
				7851	*/
				7852	if (rt > env->fbq_type)
				7853	continue;
				7854
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7855	capacity = capacity_of(i);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	7856
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7857	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7858
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7859	/*
				7860	* When comparing with imbalance, use weighted_cpuload()
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7861	* which is not scaled with the cpu capacity.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7862	*/
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	7863
				7864	if (rq->nr_running == 1 && wl > env->imbalance &&
				7865	!check_cpu_capacity(rq, env->sd))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7866	continue;
				7867
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7868	/*
				7869	* For the load comparisons with the other cpu's, consider
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7870	* the weighted_cpuload() scaled with the cpu capacity, so
				7871	* that the load can be moved away from the cpu that is
				7872	* potentially running at a lower capacity.
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	7873	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7874	* Thus we're looking for max(wl_i / capacity_i), crosswise
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	7875	* multiplication to rid ourselves of the division works out
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7876	* to: wl_i * capacity_j > wl_j * capacity_i; where j is
				7877	* our previous maximum.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	7878	*/
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7879	if (wl * busiest_capacity > busiest_load * capacity) {
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	7880	busiest_load = wl;
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7881	busiest_capacity = capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7882	busiest = rq;
				7883	}
				7884	}
				7885
				7886	return busiest;
				7887	}
				7888
				7889	/*
				7890	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				7891	* so long as it is large enough.
				7892	*/
				7893	#define MAX_PINNED_INTERVAL 512
				7894
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7895	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	7896	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7897	struct sched_domain *sd = env->sd;
				7898
				7899	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7900
				7901	/*
				7902	* ASYM_PACKING needs to force migrate tasks from busy but
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	7903	* lower priority CPUs in order to pack all tasks in the
				7904	* highest priority CPUs.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7905	*/
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	7906	if ((sd->flags & SD_ASYM_PACKING) &&
				7907	sched_asym_prefer(env->dst_cpu, env->src_cpu))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7908	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	7909	}
				7910
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7911	/*
				7912	* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
				7913	* It's worth migrating the task if the src_cpu's capacity is reduced
				7914	* because of other sched_class or IRQs if more capacity stays
				7915	* available on dst_cpu.
				7916	*/
				7917	if ((env->idle != CPU_NOT_IDLE) &&
				7918	(env->src_rq->cfs.h_nr_running == 1)) {
				7919	if ((check_cpu_capacity(env->src_rq, sd)) &&
				7920	(capacity_of(env->src_cpu)sd->imbalance_pct < capacity_of(env->dst_cpu)100))
				7921	return 1;
				7922	}
				7923
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	7924	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				7925	}
				7926
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7927	static int active_load_balance_cpu_stop(void *data);
				7928
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7929	static int should_we_balance(struct lb_env *env)
				7930	{
				7931	struct sched_group *sg = env->sd->groups;
				7932	struct cpumask sg_cpus, sg_mask;
				7933	int cpu, balance_cpu = -1;
				7934
				7935	/*
				7936	* In the newly idle case, we will allow all the cpu's
				7937	* to do the newly idle load balance.
				7938	*/
				7939	if (env->idle == CPU_NEWLY_IDLE)
				7940	return 1;
				7941
				7942	sg_cpus = sched_group_cpus(sg);
				7943	sg_mask = sched_group_mask(sg);
				7944	/* Try to find first idle cpu */
				7945	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
				7946	if (!cpumask_test_cpu(cpu, sg_mask) \|\| !idle_cpu(cpu))
				7947	continue;
				7948
				7949	balance_cpu = cpu;
				7950	break;
				7951	}
				7952
				7953	if (balance_cpu == -1)
				7954	balance_cpu = group_balance_cpu(sg);
				7955
				7956	/*
				7957	* First idle cpu or the first cpu(busiest) in this sched group
				7958	* is eligible for doing load balancing at this and above domains.
				7959	*/
Joonsoo Kim	b0cff9d	2013-09-10 15:54:49 +0900	[diff] [blame]	7960	return balance_cpu == env->dst_cpu;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7961	}
				7962
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7963	/*
				7964	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				7965	* tasks if there is an imbalance.
				7966	*/
				7967	static int load_balance(int this_cpu, struct rq *this_rq,
				7968	struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7969	int *continue_balancing)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7970	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7971	int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7972	struct sched_domain *sd_parent = sd->parent;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7973	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7974	struct rq *busiest;
				7975	unsigned long flags;
Christoph Lameter	4ba2968	2014-08-26 19:12:21 -0500	[diff] [blame]	7976	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7977
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7978	struct lb_env env = {
				7979	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	7980	.dst_cpu = this_cpu,
				7981	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7982	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7983	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	7984	.loop_break = sched_nr_migrate_break,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	7985	.cpus = cpus,
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7986	.fbq_type = all,
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	7987	.tasks = LIST_HEAD_INIT(env.tasks),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7988	};
				7989
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	7990	/*
				7991	* For NEWLY_IDLE load_balancing, we don't need to consider
				7992	* other cpus in our group
				7993	*/
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	7994	if (idle == CPU_NEWLY_IDLE)
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	7995	env.dst_grpmask = NULL;
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	7996
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7997	cpumask_copy(cpus, cpu_active_mask);
				7998
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	7999	schedstat_inc(sd->lb_count[idle]);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8000
				8001	redo:
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8002	if (!should_we_balance(&env)) {
				8003	*continue_balancing = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8004	goto out_balanced;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8005	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8006
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8007	group = find_busiest_group(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8008	if (!group) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8009	schedstat_inc(sd->lb_nobusyg[idle]);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8010	goto out_balanced;
				8011	}
				8012
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	8013	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8014	if (!busiest) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8015	schedstat_inc(sd->lb_nobusyq[idle]);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8016	goto out_balanced;
				8017	}
				8018
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	8019	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8020
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8021	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8022
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8023	env.src_cpu = busiest->cpu;
				8024	env.src_rq = busiest;
				8025
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8026	ld_moved = 0;
				8027	if (busiest->nr_running > 1) {
				8028	/*
				8029	* Attempt to move tasks. If find_busiest_group has found
				8030	* an imbalance but busiest->nr_running <= 1, the group is
				8031	* still unbalanced. ld_moved simply stays zero, so it is
				8032	* correctly treated as an imbalance.
				8033	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8034	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	8035	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8036
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	8037	more_balance:
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8038	raw_spin_lock_irqsave(&busiest->lock, flags);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8039
				8040	/*
				8041	* cur_ld_moved - load moved in current iteration
				8042	* ld_moved - cumulative load moved across iterations
				8043	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8044	cur_ld_moved = detach_tasks(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8045
				8046	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8047	* We've detached some tasks from busiest_rq. Every
				8048	* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
				8049	* unlock busiest->lock, and we are able to be sure
				8050	* that nobody can manipulate the tasks in parallel.
				8051	* See task_rq_lock() family for the details.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8052	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8053
				8054	raw_spin_unlock(&busiest->lock);
				8055
				8056	if (cur_ld_moved) {
				8057	attach_tasks(&env);
				8058	ld_moved += cur_ld_moved;
				8059	}
				8060
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8061	local_irq_restore(flags);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8062
Joonsoo Kim	f1cd085	2013-04-23 17:27:37 +0900	[diff] [blame]	8063	if (env.flags & LBF_NEED_BREAK) {
				8064	env.flags &= ~LBF_NEED_BREAK;
				8065	goto more_balance;
				8066	}
				8067
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8068	/*
				8069	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				8070	* us and move them to an alternate dst_cpu in our sched_group
				8071	* where they can run. The upper limit on how many times we
				8072	* iterate on same src_cpu is dependent on number of cpus in our
				8073	* sched_group.
				8074	*
				8075	* This changes load balance semantics a bit on who can move
				8076	* load to a given_cpu. In addition to the given_cpu itself
				8077	* (or a ilb_cpu acting on its behalf where given_cpu is
				8078	* nohz-idle), we now have balance_cpu in a position to move
				8079	* load to given_cpu. In rare situations, this may cause
				8080	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				8081	* _independently_ and at _same_ time to move some load to
				8082	* given_cpu) causing exceess load to be moved to given_cpu.
				8083	* This however should not happen so much in practice and
				8084	* moreover subsequent load balance cycles should correct the
				8085	* excess load moved.
				8086	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8087	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8088
Vladimir Davydov	7aff2e3	2013-09-15 21:30:13 +0400	[diff] [blame]	8089	/* Prevent to re-select dst_cpu via env's cpus */
				8090	cpumask_clear_cpu(env.dst_cpu, env.cpus);
				8091
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	8092	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8093	env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8094	env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8095	env.loop = 0;
				8096	env.loop_break = sched_nr_migrate_break;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	8097
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	8098	/*
				8099	* Go back to "more_balance" rather than "redo" since we
				8100	* need to continue with same src_cpu.
				8101	*/
				8102	goto more_balance;
				8103	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8104
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8105	/*
				8106	* We failed to reach balance because of affinity.
				8107	*/
				8108	if (sd_parent) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	8109	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8110
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	8111	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8112	*group_imbalance = 1;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8113	}
				8114
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8115	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8116	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8117	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	8118	if (!cpumask_empty(cpus)) {
				8119	env.loop = 0;
				8120	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8121	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	8122	}
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	8123	goto out_all_pinned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8124	}
				8125	}
				8126
				8127	if (!ld_moved) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8128	schedstat_inc(sd->lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	8129	/*
				8130	* Increment the failure counter only on periodic balance.
				8131	* We do not want newidle balance, which can be very
				8132	* frequent, pollute the failure counter causing
				8133	* excessive cache_hot migrations and active balances.
				8134	*/
				8135	if (idle != CPU_NEWLY_IDLE)
				8136	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8137
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	8138	if (need_active_balance(&env)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8139	raw_spin_lock_irqsave(&busiest->lock, flags);
				8140
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8141	/* don't kick the active_load_balance_cpu_stop,
				8142	* if the curr task on busiest cpu can't be
				8143	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8144	*/
				8145	if (!cpumask_test_cpu(this_cpu,
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	8146	tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8147	raw_spin_unlock_irqrestore(&busiest->lock,
				8148	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8149	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8150	goto out_one_pinned;
				8151	}
				8152
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8153	/*
				8154	* ->active_balance synchronizes accesses to
				8155	* ->active_balance_work. Once set, it's cleared
				8156	* only after active load balance is finished.
				8157	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8158	if (!busiest->active_balance) {
				8159	busiest->active_balance = 1;
				8160	busiest->push_cpu = this_cpu;
				8161	active_balance = 1;
				8162	}
				8163	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8164
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	8165	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8166	stop_one_cpu_nowait(cpu_of(busiest),
				8167	active_load_balance_cpu_stop, busiest,
				8168	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	8169	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8170
Srikar Dronamraju	d02c07118	2016-03-23 17:54:44 +0530	[diff] [blame]	8171	/* We've kicked active balancing, force task migration. */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8172	sd->nr_balance_failed = sd->cache_nice_tries+1;
				8173	}
				8174	} else
				8175	sd->nr_balance_failed = 0;
				8176
				8177	if (likely(!active_balance)) {
				8178	/* We were unbalanced, so reset the balancing interval */
				8179	sd->balance_interval = sd->min_interval;
				8180	} else {
				8181	/*
				8182	* If we've begun active balancing, start to back off. This
				8183	* case may not be covered by the all_pinned logic if there
				8184	* is only 1 task on the busy runqueue (because we don't call
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	8185	* detach_tasks).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8186	*/
				8187	if (sd->balance_interval < sd->max_interval)
				8188	sd->balance_interval *= 2;
				8189	}
				8190
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8191	goto out;
				8192
				8193	out_balanced:
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	8194	/*
				8195	* We reach balance although we may have faced some affinity
				8196	* constraints. Clear the imbalance flag if it was set.
				8197	*/
				8198	if (sd_parent) {
				8199	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
				8200
				8201	if (*group_imbalance)
				8202	*group_imbalance = 0;
				8203	}
				8204
				8205	out_all_pinned:
				8206	/*
				8207	* We reach balance because all tasks are pinned at this level so
				8208	* we can't migrate them. Let the imbalance flag set so parent level
				8209	* can try to migrate them.
				8210	*/
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8211	schedstat_inc(sd->lb_balanced[idle]);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8212
				8213	sd->nr_balance_failed = 0;
				8214
				8215	out_one_pinned:
				8216	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8217	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	8218	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8219	(sd->balance_interval < sd->max_interval))
				8220	sd->balance_interval *= 2;
				8221
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	8222	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8223	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8224	return ld_moved;
				8225	}
				8226
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8227	static inline unsigned long
				8228	get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
				8229	{
				8230	unsigned long interval = sd->balance_interval;
				8231
				8232	if (cpu_busy)
				8233	interval *= sd->busy_factor;
				8234
				8235	/* scale ms to jiffies */
				8236	interval = msecs_to_jiffies(interval);
				8237	interval = clamp(interval, 1UL, max_load_balance_interval);
				8238
				8239	return interval;
				8240	}
				8241
				8242	static inline void
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8243	update_next_balance(struct sched_domain sd, unsigned long next_balance)
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8244	{
				8245	unsigned long interval, next;
				8246
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8247	/* used by idle balance, so cpu_busy = 0 */
				8248	interval = get_sd_balance_interval(sd, 0);
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8249	next = sd->last_balance + interval;
				8250
				8251	if (time_after(*next_balance, next))
				8252	*next_balance = next;
				8253	}
				8254
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8255	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8256	* idle_balance is called by schedule() if this_cpu is about to become
				8257	* idle. Attempts to pull tasks from other CPUs.
				8258	*/
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8259	static int idle_balance(struct rq *this_rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8260	{
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8261	unsigned long next_balance = jiffies + HZ;
				8262	int this_cpu = this_rq->cpu;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8263	struct sched_domain *sd;
				8264	int pulled_task = 0;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8265	u64 curr_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8266
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8267	/*
				8268	* We must set idle_stamp _before_ calling idle_balance(), such that we
				8269	* measure the duration of idle_balance() as idle time.
				8270	*/
				8271	this_rq->idle_stamp = rq_clock(this_rq);
				8272
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	8273	if (this_rq->avg_idle < sysctl_sched_migration_cost \|\|
				8274	!this_rq->rd->overload) {
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8275	rcu_read_lock();
				8276	sd = rcu_dereference_check_sched_domain(this_rq->sd);
				8277	if (sd)
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8278	update_next_balance(sd, &next_balance);
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8279	rcu_read_unlock();
				8280
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8281	goto out;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8282	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8283
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8284	raw_spin_unlock(&this_rq->lock);
				8285
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	8286	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8287	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8288	for_each_domain(this_cpu, sd) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8289	int continue_balancing = 1;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8290	u64 t0, domain_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8291
				8292	if (!(sd->flags & SD_LOAD_BALANCE))
				8293	continue;
				8294
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8295	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8296	update_next_balance(sd, &next_balance);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8297	break;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8298	}
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8299
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8300	if (sd->flags & SD_BALANCE_NEWIDLE) {
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8301	t0 = sched_clock_cpu(this_cpu);
				8302
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8303	pulled_task = load_balance(this_cpu, this_rq,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8304	sd, CPU_NEWLY_IDLE,
				8305	&continue_balancing);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	8306
				8307	domain_cost = sched_clock_cpu(this_cpu) - t0;
				8308	if (domain_cost > sd->max_newidle_lb_cost)
				8309	sd->max_newidle_lb_cost = domain_cost;
				8310
				8311	curr_cost += domain_cost;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8312	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8313
Leo Yan	31851a9	2016-08-05 14:31:29 +0800	[diff] [blame]	8314	update_next_balance(sd, &next_balance);
Jason Low	39a4d9c	2014-04-23 18:30:35 -0700	[diff] [blame]	8315
				8316	/*
				8317	* Stop searching for tasks to pull if there are
				8318	* now runnable tasks on this rq.
				8319	*/
				8320	if (pulled_task \|\| this_rq->nr_running > 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8321	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8322	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8323	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	8324
				8325	raw_spin_lock(&this_rq->lock);
				8326
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	8327	if (curr_cost > this_rq->max_idle_balance_cost)
				8328	this_rq->max_idle_balance_cost = curr_cost;
				8329
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	8330	/*
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	8331	* While browsing the domains, we released the rq lock, a task could
				8332	* have been enqueued in the meantime. Since we're not going idle,
				8333	* pretend we pulled a task.
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	8334	*/
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	8335	if (this_rq->cfs.h_nr_running && !pulled_task)
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8336	pulled_task = 1;
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	8337
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8338	out:
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8339	/* Move the next balance forward */
				8340	if (time_after(this_rq->next_balance, next_balance))
				8341	this_rq->next_balance = next_balance;
				8342
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	8343	/* Is there a task of a high priority class? */
Kirill Tkhai	4638364	2014-03-15 02:15:07 +0400	[diff] [blame]	8344	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	8345	pulled_task = -1;
				8346
Dietmar Eggemann	38c6ade	2015-10-20 13:04:41 +0100	[diff] [blame]	8347	if (pulled_task)
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	8348	this_rq->idle_stamp = 0;
				8349
Daniel Lezcano	3c4017c	2014-01-17 10:04:03 +0100	[diff] [blame]	8350	return pulled_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8351	}
				8352
				8353	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8354	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				8355	* running tasks off the busiest CPU onto idle CPUs. It requires at
				8356	* least 1 task to be running on each physical CPU where possible, and
				8357	* avoids physical / logical imbalances.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8358	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8359	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8360	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8361	struct rq *busiest_rq = data;
				8362	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8363	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8364	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8365	struct sched_domain *sd;
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	8366	struct task_struct *p = NULL;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8367
				8368	raw_spin_lock_irq(&busiest_rq->lock);
				8369
				8370	/* make sure the requested cpu hasn't gone down in the meantime */
				8371	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				8372	!busiest_rq->active_balance))
				8373	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8374
				8375	/* Is there any task to move? */
				8376	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8377	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8378
				8379	/*
				8380	* This condition is "impossible", if it occurs
				8381	* we need to fix it. Originally reported by
				8382	* Bjorn Helgaas on a 128-cpu setup.
				8383	*/
				8384	BUG_ON(busiest_rq == target_rq);
				8385
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8386	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8387	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8388	for_each_domain(target_cpu, sd) {
				8389	if ((sd->flags & SD_LOAD_BALANCE) &&
				8390	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				8391	break;
				8392	}
				8393
				8394	if (likely(sd)) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8395	struct lb_env env = {
				8396	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	8397	.dst_cpu = target_cpu,
				8398	.dst_rq = target_rq,
				8399	.src_cpu = busiest_rq->cpu,
				8400	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	8401	.idle = CPU_IDLE,
				8402	};
				8403
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8404	schedstat_inc(sd->alb_count);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8405
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	8406	p = detach_one_task(&env);
Srikar Dronamraju	d02c07118	2016-03-23 17:54:44 +0530	[diff] [blame]	8407	if (p) {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8408	schedstat_inc(sd->alb_pushed);
Srikar Dronamraju	d02c07118	2016-03-23 17:54:44 +0530	[diff] [blame]	8409	/* Active balancing done, reset the failure counter. */
				8410	sd->nr_balance_failed = 0;
				8411	} else {
Josh Poimboeuf	ae92882	2016-06-17 12:43:24 -0500	[diff] [blame]	8412	schedstat_inc(sd->alb_failed);
Srikar Dronamraju	d02c07118	2016-03-23 17:54:44 +0530	[diff] [blame]	8413	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8414	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8415	rcu_read_unlock();
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8416	out_unlock:
				8417	busiest_rq->active_balance = 0;
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	8418	raw_spin_unlock(&busiest_rq->lock);
				8419
				8420	if (p)
				8421	attach_one_task(target_rq, p);
				8422
				8423	local_irq_enable();
				8424
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	8425	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8426	}
				8427
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	8428	static inline int on_null_domain(struct rq *rq)
				8429	{
				8430	return unlikely(!rcu_dereference_sched(rq->sd));
				8431	}
				8432
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8433	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8434	/*
				8435	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8436	* - When one of the busy CPUs notice that there may be an idle rebalancing
				8437	* needed, they will kick the idle load balancer, which then does idle
				8438	* load balancing for all the idle CPUs.
				8439	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8440	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8441	cpumask_var_t idle_cpus_mask;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8442	atomic_t nr_cpus;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8443	unsigned long next_balance; /* in jiffy units */
				8444	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8445
Daniel Lezcano	3dd0337	2014-01-06 12:34:41 +0100	[diff] [blame]	8446	static inline int find_new_ilb(void)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8447	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8448	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8449
Suresh Siddha	786d6dc	2011-12-01 17:07:35 -0800	[diff] [blame]	8450	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				8451	return ilb;
				8452
				8453	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8454	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8455
				8456	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8457	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				8458	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				8459	* CPU (if there is one).
				8460	*/
Daniel Lezcano	0aeeeeb	2014-01-06 12:34:42 +0100	[diff] [blame]	8461	static void nohz_balancer_kick(void)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8462	{
				8463	int ilb_cpu;
				8464
				8465	nohz.next_balance++;
				8466
Daniel Lezcano	3dd0337	2014-01-06 12:34:41 +0100	[diff] [blame]	8467	ilb_cpu = find_new_ilb();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8468
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8469	if (ilb_cpu >= nr_cpu_ids)
				8470	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8471
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	8472	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8473	return;
				8474	/*
				8475	* Use smp_send_reschedule() instead of resched_cpu().
				8476	* This way we generate a sched IPI on the target cpu which
				8477	* is idle. And the softirq performing nohz idle load balance
				8478	* will be run before returning from the IPI.
				8479	*/
				8480	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8481	return;
				8482	}
				8483
Thomas Gleixner	20a5c8c	2016-03-10 12:54:20 +0100	[diff] [blame]	8484	void nohz_balance_exit_idle(unsigned int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	8485	{
				8486	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	8487	/*
				8488	* Completely isolated CPUs don't ever set, so we must test.
				8489	*/
				8490	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
				8491	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				8492	atomic_dec(&nohz.nr_cpus);
				8493	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	8494	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				8495	}
				8496	}
				8497
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8498	static inline void set_cpu_sd_state_busy(void)
				8499	{
				8500	struct sched_domain *sd;
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	8501	int cpu = smp_processor_id();
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8502
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8503	rcu_read_lock();
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8504	sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	8505
				8506	if (!sd \|\| !sd->nohz_idle)
				8507	goto unlock;
				8508	sd->nohz_idle = 0;
				8509
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8510	atomic_inc(&sd->shared->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	8511	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8512	rcu_read_unlock();
				8513	}
				8514
				8515	void set_cpu_sd_state_idle(void)
				8516	{
				8517	struct sched_domain *sd;
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	8518	int cpu = smp_processor_id();
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8519
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8520	rcu_read_lock();
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8521	sd = rcu_dereference(per_cpu(sd_llc, cpu));
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	8522
				8523	if (!sd \|\| sd->nohz_idle)
				8524	goto unlock;
				8525	sd->nohz_idle = 1;
				8526
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8527	atomic_dec(&sd->shared->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	8528	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8529	rcu_read_unlock();
				8530	}
				8531
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8532	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8533	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8534	* This info will be used in performing idle load balancing in the future.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8535	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8536	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8537	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	8538	/*
				8539	* If this cpu is going down, then nothing needs to be done.
				8540	*/
				8541	if (!cpu_active(cpu))
				8542	return;
				8543
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8544	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				8545	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8546
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	8547	/*
				8548	* If we're a completely isolated CPU, we don't play.
				8549	*/
				8550	if (on_null_domain(cpu_rq(cpu)))
				8551	return;
				8552
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8553	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				8554	atomic_inc(&nohz.nr_cpus);
				8555	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8556	}
				8557	#endif
				8558
				8559	static DEFINE_SPINLOCK(balancing);
				8560
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	8561	/*
				8562	* Scale the max load_balance interval with the number of CPUs in the system.
				8563	* This trades load-balance latency on larger machines for less cross talk.
				8564	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8565	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	8566	{
				8567	max_load_balance_interval = HZ*num_online_cpus()/10;
				8568	}
				8569
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8570	/*
				8571	* It checks each scheduling domain to see if it is due to be balanced,
				8572	* and initiates a balancing operation if so.
				8573	*
Libin	b9b0853	2013-04-01 19:14:01 +0800	[diff] [blame]	8574	* Balancing parameters are set up in init_sched_domains.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8575	*/
Daniel Lezcano	f7ed0a8	2014-01-06 12:34:43 +0100	[diff] [blame]	8576	static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8577	{
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8578	int continue_balancing = 1;
Daniel Lezcano	f7ed0a8	2014-01-06 12:34:43 +0100	[diff] [blame]	8579	int cpu = rq->cpu;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8580	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	8581	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8582	/* Earliest time when we have to do rebalance again */
				8583	unsigned long next_balance = jiffies + 60*HZ;
				8584	int update_next_balance = 0;
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8585	int need_serialize, need_decay = 0;
				8586	u64 max_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8587
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	8588	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	8589
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8590	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8591	for_each_domain(cpu, sd) {
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8592	/*
				8593	* Decay the newidle max times here because this is a regular
				8594	* visit to all the domains. Decay ~1% per second.
				8595	*/
				8596	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
				8597	sd->max_newidle_lb_cost =
				8598	(sd->max_newidle_lb_cost * 253) / 256;
				8599	sd->next_decay_max_lb_cost = jiffies + HZ;
				8600	need_decay = 1;
				8601	}
				8602	max_cost += sd->max_newidle_lb_cost;
				8603
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8604	if (!(sd->flags & SD_LOAD_BALANCE))
				8605	continue;
				8606
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8607	/*
				8608	* Stop the load balance at this level. There is another
				8609	* CPU in our sched group which is doing load balancing more
				8610	* actively.
				8611	*/
				8612	if (!continue_balancing) {
				8613	if (need_decay)
				8614	continue;
				8615	break;
				8616	}
				8617
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8618	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8619
				8620	need_serialize = sd->flags & SD_SERIALIZE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8621	if (need_serialize) {
				8622	if (!spin_trylock(&balancing))
				8623	goto out;
				8624	}
				8625
				8626	if (time_after_eq(jiffies, sd->last_balance + interval)) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	8627	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8628	/*
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	8629	* The LBF_DST_PINNED logic could have changed
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	8630	* env->dst_cpu, so we can't know our idle
				8631	* state even if we migrated tasks. Update it.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8632	*/
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	8633	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8634	}
				8635	sd->last_balance = jiffies;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	8636	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8637	}
				8638	if (need_serialize)
				8639	spin_unlock(&balancing);
				8640	out:
				8641	if (time_after(next_balance, sd->last_balance + interval)) {
				8642	next_balance = sd->last_balance + interval;
				8643	update_next_balance = 1;
				8644	}
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8645	}
				8646	if (need_decay) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8647	/*
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8648	* Ensure the rq-wide value also decays but keep it at a
				8649	* reasonable floor to avoid funnies with rq->avg_idle.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8650	*/
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	8651	rq->max_idle_balance_cost =
				8652	max((u64)sysctl_sched_migration_cost, max_cost);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8653	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	8654	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8655
				8656	/*
				8657	* next_balance will be updated only when there is a need.
				8658	* When the cpu is attached to null domain for ex, it will not be
				8659	* updated.
				8660	*/
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8661	if (likely(update_next_balance)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8662	rq->next_balance = next_balance;
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8663
				8664	#ifdef CONFIG_NO_HZ_COMMON
				8665	/*
				8666	* If this CPU has been elected to perform the nohz idle
				8667	* balance. Other idle CPUs have already rebalanced with
				8668	* nohz_idle_balance() and nohz.next_balance has been
				8669	* updated accordingly. This CPU is now running the idle load
				8670	* balance for itself and we need to update the
				8671	* nohz.next_balance accordingly.
				8672	*/
				8673	if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
				8674	nohz.next_balance = rq->next_balance;
				8675	#endif
				8676	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8677	}
				8678
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8679	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8680	/*
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8681	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8682	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				8683	*/
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8684	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8685	{
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8686	int this_cpu = this_rq->cpu;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8687	struct rq *rq;
				8688	int balance_cpu;
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8689	/* Earliest time when we have to do rebalance again */
				8690	unsigned long next_balance = jiffies + 60*HZ;
				8691	int update_next_balance = 0;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8692
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8693	if (idle != CPU_IDLE \|\|
				8694	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				8695	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8696
				8697	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	8698	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8699	continue;
				8700
				8701	/*
				8702	* If this cpu gets work to do, stop the load balancing
				8703	* work being done for other cpus. Next load
				8704	* balancing owner will pick it up.
				8705	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8706	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8707	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8708
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	8709	rq = cpu_rq(balance_cpu);
				8710
Tim Chen	ed61bbc	2014-05-20 14:39:27 -0700	[diff] [blame]	8711	/*
				8712	* If time for next balance is due,
				8713	* do the balance.
				8714	*/
				8715	if (time_after_eq(jiffies, rq->next_balance)) {
				8716	raw_spin_lock_irq(&rq->lock);
				8717	update_rq_clock(rq);
Frederic Weisbecker	cee1afc	2016-04-13 15:56:50 +0200	[diff] [blame]	8718	cpu_load_update_idle(rq);
Tim Chen	ed61bbc	2014-05-20 14:39:27 -0700	[diff] [blame]	8719	raw_spin_unlock_irq(&rq->lock);
				8720	rebalance_domains(rq, CPU_IDLE);
				8721	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8722
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8723	if (time_after(next_balance, rq->next_balance)) {
				8724	next_balance = rq->next_balance;
				8725	update_next_balance = 1;
				8726	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8727	}
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	8728
				8729	/*
				8730	* next_balance will be updated only when there is a need.
				8731	* When the CPU is attached to null domain for ex, it will not be
				8732	* updated.
				8733	*/
				8734	if (likely(update_next_balance))
				8735	nohz.next_balance = next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8736	end:
				8737	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8738	}
				8739
				8740	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8741	* Current heuristic for kicking the idle load balancer in the presence
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8742	* of an idle cpu in the system.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8743	* - This rq has more than one task.
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8744	* - This rq has at least one CFS task and the capacity of the CPU is
				8745	* significantly reduced because of RT tasks or IRQs.
				8746	* - At parent of LLC scheduler domain level, this cpu's scheduler group has
				8747	* multiple busy cpu.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8748	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				8749	* domain span are idle.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8750	*/
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8751	static inline bool nohz_kick_needed(struct rq *rq)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8752	{
				8753	unsigned long now = jiffies;
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8754	struct sched_domain_shared *sds;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8755	struct sched_domain *sd;
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	8756	int nr_busy, i, cpu = rq->cpu;
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8757	bool kick = false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8758
Daniel Lezcano	4a72562	2014-01-06 12:34:39 +0100	[diff] [blame]	8759	if (unlikely(rq->idle_balance))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8760	return false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8761
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8762	/*
				8763	* We may be recently in ticked or tickless idle mode. At the first
				8764	* busy tick after returning from idle, we will update the busy stats.
				8765	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	8766	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	8767	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8768
				8769	/*
				8770	* None are in tickless mode and hence no need for NOHZ idle load
				8771	* balancing.
				8772	*/
				8773	if (likely(!atomic_read(&nohz.nr_cpus)))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8774	return false;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	8775
				8776	if (time_before(now, nohz.next_balance))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8777	return false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8778
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	8779	if (rq->nr_running >= 2)
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8780	return true;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8781
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	8782	rcu_read_lock();
Peter Zijlstra	0e369d7	2016-05-09 10:38:01 +0200	[diff] [blame]	8783	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
				8784	if (sds) {
				8785	/*
				8786	* XXX: write a coherent comment on why we do this.
				8787	* See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
				8788	*/
				8789	nr_busy = atomic_read(&sds->nr_busy_cpus);
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8790	if (nr_busy > 1) {
				8791	kick = true;
				8792	goto unlock;
				8793	}
				8794
				8795	}
				8796
				8797	sd = rcu_dereference(rq->sd);
				8798	if (sd) {
				8799	if ((rq->cfs.h_nr_running >= 1) &&
				8800	check_cpu_capacity(rq, sd)) {
				8801	kick = true;
				8802	goto unlock;
				8803	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8804	}
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	8805
				8806	sd = rcu_dereference(per_cpu(sd_asym, cpu));
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	8807	if (sd) {
				8808	for_each_cpu(i, sched_domain_span(sd)) {
				8809	if (i == cpu \|\|
				8810	!cpumask_test_cpu(i, nohz.idle_cpus_mask))
				8811	continue;
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	8812
Tim Chen	afe06ef	2016-11-22 12:23:53 -0800	[diff] [blame^]	8813	if (sched_asym_prefer(i, cpu)) {
				8814	kick = true;
				8815	goto unlock;
				8816	}
				8817	}
				8818	}
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8819	unlock:
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	8820	rcu_read_unlock();
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	8821	return kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8822	}
				8823	#else
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8824	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8825	#endif
				8826
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8827	/*
				8828	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8829	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8830	*/
Emese Revfy	0766f78	2016-06-20 20:42:34 +0200	[diff] [blame]	8831	static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8832	{
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8833	struct rq *this_rq = this_rq();
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	8834	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8835	CPU_IDLE : CPU_NOT_IDLE;
				8836
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8837	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8838	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8839	* balancing on behalf of the other idle cpus whose ticks are
Preeti U Murthy	d4573c3	2015-03-26 18:32:44 +0530	[diff] [blame]	8840	* stopped. Do nohz_idle_balance before rebalance_domains to
				8841	* give the idle cpus a chance to load balance. Else we may
				8842	* load balance only within the local sched_domain hierarchy
				8843	* and abort nohz_idle_balance altogether if we pull some load.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8844	*/
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	8845	nohz_idle_balance(this_rq, idle);
Preeti U Murthy	d4573c3	2015-03-26 18:32:44 +0530	[diff] [blame]	8846	rebalance_domains(this_rq, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8847	}
				8848
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8849	/*
				8850	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8851	*/
Daniel Lezcano	7caff66	2014-01-06 12:34:38 +0100	[diff] [blame]	8852	void trigger_load_balance(struct rq *rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8853	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8854	/* Don't need to rebalance while attached to NULL domain */
Daniel Lezcano	c726099	2014-01-06 12:34:45 +0100	[diff] [blame]	8855	if (unlikely(on_null_domain(rq)))
				8856	return;
				8857
				8858	if (time_after_eq(jiffies, rq->next_balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8859	raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8860	#ifdef CONFIG_NO_HZ_COMMON
Daniel Lezcano	c726099	2014-01-06 12:34:45 +0100	[diff] [blame]	8861	if (nohz_kick_needed(rq))
Daniel Lezcano	0aeeeeb	2014-01-06 12:34:42 +0100	[diff] [blame]	8862	nohz_balancer_kick();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	8863	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	8864	}
				8865
/*
 * Fair-class hook for a runqueue coming online: refresh the sysctl-derived
 * values, then update the runtime-enabled state for @rq.
 */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
	update_runtime_enabled(rq);
}
				8872
				8873	static void rq_offline_fair(struct rq *rq)
				8874	{
				8875	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	8876
				8877	/* Ensure any throttled groups are reachable by pick_next_task */
				8878	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	8879	}
				8880
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	8881	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	8882
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8883	/*
				8884	* scheduler tick hitting a task of our scheduling class:
				8885	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	8886	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8887	{
				8888	struct cfs_rq *cfs_rq;
				8889	struct sched_entity *se = &curr->se;
				8890
				8891	for_each_sched_entity(se) {
				8892	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	8893	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8894	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	8895
Srikar Dronamraju	b52da86	2015-10-02 07:48:25 +0530	[diff] [blame]	8896	if (static_branch_unlikely(&sched_numa_balancing))
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	8897	task_tick_numa(rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8898	}
				8899
				8900	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8901	* called on fork with the child task as argument from the parent's context
				8902	* - child not yet on the tasklist
				8903	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8904	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8905	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8906	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	8907	struct cfs_rq *cfs_rq;
				8908	struct sched_entity se = &p->se, curr;
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8909	struct rq *rq = this_rq();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8910
Peter Zijlstra	e210bff	2016-06-16 18:51:48 +0200	[diff] [blame]	8911	raw_spin_lock(&rq->lock);
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	8912	update_rq_clock(rq);
				8913
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	8914	cfs_rq = task_cfs_rq(current);
				8915	curr = cfs_rq->curr;
Peter Zijlstra	e210bff	2016-06-16 18:51:48 +0200	[diff] [blame]	8916	if (curr) {
				8917	update_curr(cfs_rq);
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	8918	se->vruntime = curr->vruntime;
Peter Zijlstra	e210bff	2016-06-16 18:51:48 +0200	[diff] [blame]	8919	}
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	8920	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	8921
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8922	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	8923	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	8924	* Upon rescheduling, sched_class::put_prev_task() will place
				8925	* 'current' within the tree based on its new key value.
				8926	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	8927	swap(curr->vruntime, se->vruntime);
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	8928	resched_curr(rq);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	8929	}
				8930
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	8931	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	e210bff	2016-06-16 18:51:48 +0200	[diff] [blame]	8932	raw_spin_unlock(&rq->lock);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8933	}
				8934
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8935	/*
				8936	* Priority of the task has changed. Check to see if we preempt
				8937	* the current task.
				8938	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8939	static void
				8940	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8941	{
Kirill Tkhai	da0c1e6	2014-08-20 13:47:32 +0400	[diff] [blame]	8942	if (!task_on_rq_queued(p))
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8943	return;
				8944
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8945	/*
				8946	* Reschedule if we are currently running on this runqueue and
				8947	* our priority decreased, or if we are not currently running on
				8948	* this runqueue and our priority is higher than the current's
				8949	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8950	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8951	if (p->prio > oldprio)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	8952	resched_curr(rq);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8953	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	8954	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8955	}
				8956
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8957	static inline bool vruntime_normalized(struct task_struct *p)
				8958	{
				8959	struct sched_entity *se = &p->se;
				8960
				8961	/*
				8962	* In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
				8963	* the dequeue_entity(.flags=0) will already have normalized the
				8964	* vruntime.
				8965	*/
				8966	if (p->on_rq)
				8967	return true;
				8968
				8969	/*
				8970	* When !on_rq, vruntime of the task has usually NOT been normalized.
				8971	* But there are some cases where it has already been normalized:
				8972	*
				8973	* - A forked child which is waiting for being woken up by
				8974	* wake_up_new_task().
				8975	* - A task which has been woken up by try_to_wake_up() and
				8976	* waiting for actually being woken up by sched_ttwu_pending().
				8977	*/
				8978	if (!se->sum_exec_runtime \|\| p->state == TASK_WAKING)
				8979	return true;
				8980
				8981	return false;
				8982	}
				8983
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * Propagate the changes of the sched_entity across the tg tree to make it
 * visible to the root. The walk begins at @se's parent and stops at the
 * first throttled cfs_rq.
 */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
	/* The entity itself was handled by the caller; start one level up. */
	se = se->parent;

	for_each_sched_entity(se) {
		if (cfs_rq_throttled(cfs_rq_of(se)))
			break;

		update_load_avg(se, UPDATE_TG);
	}
}
#else
/* Without group scheduling there is nothing to propagate. */
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
}
#endif
				9008
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	9009	static void detach_entity_cfs_rq(struct sched_entity *se)
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9010	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9011	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				9012
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	9013	/* Catch up with the cfs_rq and remove our load when we leave */
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	9014	update_load_avg(se, 0);
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	9015	detach_entity_load_avg(cfs_rq, se);
Peter Zijlstra	7c3edd2	2016-07-13 10:56:25 +0200	[diff] [blame]	9016	update_tg_load_avg(cfs_rq, false);
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	9017	propagate_entity_cfs_rq(se);
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9018	}
				9019
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	9020	static void attach_entity_cfs_rq(struct sched_entity *se)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9021	{
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9022	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9023
				9024	#ifdef CONFIG_FAIR_GROUP_SCHED
Michael wang	eb7a59b	2014-02-20 11:14:53 +0800	[diff] [blame]	9025	/*
				9026	* Since the real-depth could have been changed (only FAIR
				9027	* class maintain depth value), reset depth properly.
				9028	*/
				9029	se->depth = se->parent ? se->parent->depth + 1 : 0;
				9030	#endif
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9031
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	9032	/* Synchronize entity with its cfs_rq */
Vincent Guittot	d31b1a6	2016-11-08 10:53:44 +0100	[diff] [blame]	9033	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9034	attach_entity_load_avg(cfs_rq, se);
Peter Zijlstra	7c3edd2	2016-07-13 10:56:25 +0200	[diff] [blame]	9035	update_tg_load_avg(cfs_rq, false);
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	9036	propagate_entity_cfs_rq(se);
Vincent Guittot	df21791	2016-11-08 10:53:42 +0100	[diff] [blame]	9037	}
				9038
				9039	static void detach_task_cfs_rq(struct task_struct *p)
				9040	{
				9041	struct sched_entity *se = &p->se;
				9042	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				9043
				9044	if (!vruntime_normalized(p)) {
				9045	/*
				9046	* Fix up our vruntime so that the current sleep doesn't
				9047	* cause 'unlimited' sleep bonus.
				9048	*/
				9049	place_entity(cfs_rq, se, 0);
				9050	se->vruntime -= cfs_rq->min_vruntime;
				9051	}
				9052
				9053	detach_entity_cfs_rq(se);
				9054	}
				9055
				9056	static void attach_task_cfs_rq(struct task_struct *p)
				9057	{
				9058	struct sched_entity *se = &p->se;
				9059	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				9060
				9061	attach_entity_cfs_rq(se);
Byungchul Park	6efdb10	2015-08-20 20:21:59 +0900	[diff] [blame]	9062
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9063	if (!vruntime_normalized(p))
				9064	se->vruntime += cfs_rq->min_vruntime;
				9065	}
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9066
/* Task @p is leaving the fair class: detach it from its cfs_rq. */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}
				9071
				9072	static void switched_to_fair(struct rq rq, struct task_struct p)
				9073	{
				9074	attach_task_cfs_rq(p);
				9075
				9076	if (task_on_rq_queued(p)) {
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9077	/*
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9078	* We were most likely switched from sched_rt, so
				9079	* kick off the schedule if running, otherwise just see
				9080	* if we can still preempt the current task.
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9081	*/
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	9082	if (rq->curr == p)
				9083	resched_curr(rq);
				9084	else
				9085	check_preempt_curr(rq, p, 0);
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	9086	}
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9087	}
				9088
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	9089	/* Account for a task changing its policy or group.
				9090	*
				9091	* This routine is mostly called to set cfs_rq->curr field when a task
				9092	* migrates between groups/classes.
				9093	*/
				9094	static void set_curr_task_fair(struct rq *rq)
				9095	{
				9096	struct sched_entity *se = &rq->curr->se;
				9097
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	9098	for_each_sched_entity(se) {
				9099	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				9100
				9101	set_next_entity(cfs_rq, se);
				9102	/* ensure bandwidth has been allocated on our new cfs_rq */
				9103	account_cfs_rq_runtime(cfs_rq, 0);
				9104	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	9105	}
				9106
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9107	void init_cfs_rq(struct cfs_rq *cfs_rq)
				9108	{
				9109	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9110	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				9111	#ifndef CONFIG_64BIT
				9112	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				9113	#endif
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	9114	#ifdef CONFIG_SMP
Vincent Guittot	09a43ac	2016-11-08 10:53:45 +0100	[diff] [blame]	9115	#ifdef CONFIG_FAIR_GROUP_SCHED
				9116	cfs_rq->propagate_avg = 0;
				9117	#endif
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	9118	atomic_long_set(&cfs_rq->removed_load_avg, 0);
				9119	atomic_long_set(&cfs_rq->removed_util_avg, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	9120	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9121	}
				9122
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9123	#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittot	ea86cb4	2016-06-17 13:38:55 +0200	[diff] [blame]	9124	static void task_set_group_fair(struct task_struct *p)
				9125	{
				9126	struct sched_entity *se = &p->se;
				9127
				9128	set_task_rq(p, task_cpu(p));
				9129	se->depth = se->parent ? se->parent->depth + 1 : 0;
				9130	}
				9131
/*
 * TASK_MOVE_GROUP handler: detach @p from its old cfs_rq, rebind it to its
 * CPU's runqueue structures, and attach it to the new cfs_rq.
 */
static void task_move_group_fair(struct task_struct *p)
{
	detach_task_cfs_rq(p);
	set_task_rq(p, task_cpu(p));

#ifdef CONFIG_SMP
	/* A zero last_update_time marks the entity as migrated to a new cfs_rq. */
	p->se.avg.last_update_time = 0;
#endif

	attach_task_cfs_rq(p);
}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9143
Vincent Guittot	ea86cb4	2016-06-17 13:38:55 +0200	[diff] [blame]	9144	static void task_change_group_fair(struct task_struct *p, int type)
				9145	{
				9146	switch (type) {
				9147	case TASK_SET_GROUP:
				9148	task_set_group_fair(p);
				9149	break;
				9150
				9151	case TASK_MOVE_GROUP:
				9152	task_move_group_fair(p);
				9153	break;
				9154	}
				9155	}
				9156
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9157	void free_fair_sched_group(struct task_group *tg)
				9158	{
				9159	int i;
				9160
				9161	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				9162
				9163	for_each_possible_cpu(i) {
				9164	if (tg->cfs_rq)
				9165	kfree(tg->cfs_rq[i]);
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9166	if (tg->se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9167	kfree(tg->se[i]);
				9168	}
				9169
				9170	kfree(tg->cfs_rq);
				9171	kfree(tg->se);
				9172	}
				9173
				9174	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				9175	{
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9176	struct sched_entity *se;
Peter Zijlstra	b7fa30c	2016-06-09 15:07:50 +0200	[diff] [blame]	9177	struct cfs_rq *cfs_rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9178	int i;
				9179
				9180	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				9181	if (!tg->cfs_rq)
				9182	goto err;
				9183	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				9184	if (!tg->se)
				9185	goto err;
				9186
				9187	tg->shares = NICE_0_LOAD;
				9188
				9189	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				9190
				9191	for_each_possible_cpu(i) {
				9192	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				9193	GFP_KERNEL, cpu_to_node(i));
				9194	if (!cfs_rq)
				9195	goto err;
				9196
				9197	se = kzalloc_node(sizeof(struct sched_entity),
				9198	GFP_KERNEL, cpu_to_node(i));
				9199	if (!se)
				9200	goto err_free_rq;
				9201
				9202	init_cfs_rq(cfs_rq);
				9203	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	9204	init_entity_runnable_average(se);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9205	}
				9206
				9207	return 1;
				9208
				9209	err_free_rq:
				9210	kfree(cfs_rq);
				9211	err:
				9212	return 0;
				9213	}
				9214
Peter Zijlstra	8663e24	2016-06-22 14:58:02 +0200	[diff] [blame]	9215	void online_fair_sched_group(struct task_group *tg)
				9216	{
				9217	struct sched_entity *se;
				9218	struct rq *rq;
				9219	int i;
				9220
				9221	for_each_possible_cpu(i) {
				9222	rq = cpu_rq(i);
				9223	se = tg->se[i];
				9224
				9225	raw_spin_lock_irq(&rq->lock);
Vincent Guittot	d032669	2016-11-08 10:53:47 +0100	[diff] [blame]	9226	attach_entity_cfs_rq(se);
Peter Zijlstra	55e16d3	2016-06-22 15:14:26 +0200	[diff] [blame]	9227	sync_throttle(tg, i);
Peter Zijlstra	8663e24	2016-06-22 14:58:02 +0200	[diff] [blame]	9228	raw_spin_unlock_irq(&rq->lock);
				9229	}
				9230	}
				9231
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9232	void unregister_fair_sched_group(struct task_group *tg)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9233	{
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9234	unsigned long flags;
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9235	struct rq *rq;
				9236	int cpu;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9237
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9238	for_each_possible_cpu(cpu) {
				9239	if (tg->se[cpu])
				9240	remove_entity_load_avg(tg->se[cpu]);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9241
Peter Zijlstra	6fe1f34	2016-01-21 22:24:16 +0100	[diff] [blame]	9242	/*
				9243	* Only empty task groups can be destroyed; so we can speculatively
				9244	* check on_list without danger of it being re-added.
				9245	*/
				9246	if (!tg->cfs_rq[cpu]->on_list)
				9247	continue;
				9248
				9249	rq = cpu_rq(cpu);
				9250
				9251	raw_spin_lock_irqsave(&rq->lock, flags);
				9252	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				9253	raw_spin_unlock_irqrestore(&rq->lock, flags);
				9254	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9255	}
				9256
				9257	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				9258	struct sched_entity *se, int cpu,
				9259	struct sched_entity *parent)
				9260	{
				9261	struct rq *rq = cpu_rq(cpu);
				9262
				9263	cfs_rq->tg = tg;
				9264	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9265	init_cfs_rq_runtime(cfs_rq);
				9266
				9267	tg->cfs_rq[cpu] = cfs_rq;
				9268	tg->se[cpu] = se;
				9269
				9270	/* se could be NULL for root_task_group */
				9271	if (!se)
				9272	return;
				9273
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	9274	if (!parent) {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9275	se->cfs_rq = &rq->cfs;
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	9276	se->depth = 0;
				9277	} else {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9278	se->cfs_rq = parent->my_q;
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	9279	se->depth = parent->depth + 1;
				9280	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9281
				9282	se->my_q = cfs_rq;
Paul Turner	0ac9b1c	2013-10-16 11:16:27 -0700	[diff] [blame]	9283	/* guarantee group entities always have weight */
				9284	update_load_set(&se->load, NICE_0_LOAD);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9285	se->parent = parent;
				9286	}
				9287
				9288	static DEFINE_MUTEX(shares_mutex);
				9289
				9290	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				9291	{
				9292	int i;
				9293	unsigned long flags;
				9294
				9295	/*
				9296	* We can't change the weight of the root cgroup.
				9297	*/
				9298	if (!tg->se[0])
				9299	return -EINVAL;
				9300
				9301	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				9302
				9303	mutex_lock(&shares_mutex);
				9304	if (tg->shares == shares)
				9305	goto done;
				9306
				9307	tg->shares = shares;
				9308	for_each_possible_cpu(i) {
				9309	struct rq *rq = cpu_rq(i);
				9310	struct sched_entity *se;
				9311
				9312	se = tg->se[i];
				9313	/* Propagate contribution to hierarchy */
				9314	raw_spin_lock_irqsave(&rq->lock, flags);
Frederic Weisbecker	71b1da4	2013-04-12 01:50:59 +0200	[diff] [blame]	9315
				9316	/* Possible calls to update_curr() need rq clock */
				9317	update_rq_clock(rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	9318	for_each_sched_entity(se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9319	update_cfs_shares(group_cfs_rq(se));
				9320	raw_spin_unlock_irqrestore(&rq->lock, flags);
				9321	}
				9322
				9323	done:
				9324	mutex_unlock(&shares_mutex);
				9325	return 0;
				9326	}
				9327	#else /* CONFIG_FAIR_GROUP_SCHED */
				9328
				9329	void free_fair_sched_group(struct task_group *tg) { }
				9330
				9331	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				9332	{
				9333	return 1;
				9334	}
				9335
/* !CONFIG_FAIR_GROUP_SCHED: no per-group entities to bring online; no-op. */
void online_fair_sched_group(struct task_group *tg)
{
}
				9337
/* !CONFIG_FAIR_GROUP_SCHED: nothing was registered, so this is a no-op. */
void unregister_fair_sched_group(struct task_group *tg)
{
}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9339
				9340	#endif /* CONFIG_FAIR_GROUP_SCHED */
				9341
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9342
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	9343	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9344	{
				9345	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9346	unsigned int rr_interval = 0;
				9347
				9348	/*
				9349	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				9350	* idle runqueue:
				9351	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9352	if (rq->cfs.load.weight)
Zhu Yanhai	a59f4e0	2013-01-08 12:56:52 +0800	[diff] [blame]	9353	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9354
				9355	return rr_interval;
				9356	}
				9357
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9358	/*
				9359	* All the scheduling class methods:
				9360	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9361	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	9362	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9363	.enqueue_task = enqueue_task_fair,
				9364	.dequeue_task = dequeue_task_fair,
				9365	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	9366	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9367
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	9368	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9369
				9370	.pick_next_task = pick_next_task_fair,
				9371	.put_prev_task = put_prev_task_fair,
				9372
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	9373	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	9374	.select_task_rq = select_task_rq_fair,
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	9375	.migrate_task_rq = migrate_task_rq_fair,
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	9376
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	9377	.rq_online = rq_online_fair,
				9378	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	9379
Yuyang Du	1269557	2015-07-15 08:04:40 +0800	[diff] [blame]	9380	.task_dead = task_dead_fair,
Peter Zijlstra	c5b2803	2015-05-15 17:43:35 +0200	[diff] [blame]	9381	.set_cpus_allowed = set_cpus_allowed_common,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	9382	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9383
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	9384	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9385	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	9386	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9387
				9388	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	9389	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	9390	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9391
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	9392	.get_rr_interval = get_rr_interval_fair,
				9393
Stanislaw Gruszka	6e99891	2014-11-12 16:58:44 +0100	[diff] [blame]	9394	.update_curr = update_curr_fair,
				9395
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9396	#ifdef CONFIG_FAIR_GROUP_SCHED
Vincent Guittot	ea86cb4	2016-06-17 13:38:55 +0200	[diff] [blame]	9397	.task_change_group = task_change_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	9398	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9399	};
				9400
				9401	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9402	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9403	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9404	struct cfs_rq *cfs_rq;
				9405
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	9406	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	9407	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	9408	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	9409	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	9410	}
Srikar Dronamraju	397f237	2015-06-25 22:51:43 +0530	[diff] [blame]	9411
				9412	#ifdef CONFIG_NUMA_BALANCING
				9413	void show_numa_stats(struct task_struct p, struct seq_file m)
				9414	{
				9415	int node;
				9416	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
				9417
				9418	for_each_online_node(node) {
				9419	if (p->numa_faults) {
				9420	tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
				9421	tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
				9422	}
				9423	if (p->numa_group) {
				9424	gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
				9425	gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
				9426	}
				9427	print_numa_stats(m, node, tsf, tpf, gsf, gpf);
				9428	}
				9429	}
				9430	#endif /* CONFIG_NUMA_BALANCING */
				9431	#endif /* CONFIG_SCHED_DEBUG */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9432
				9433	__init void init_sched_fair_class(void)
				9434	{
				9435	#ifdef CONFIG_SMP
				9436	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				9437
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	9438	#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	9439	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9440	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	9441	#endif
				9442	#endif /* SMP */
				9443
				9444	}