Blame - kernel/sched/fair.c - SHIFTPHONES/mainline/linux

blob: 56b7d4b839476b6ed1692e786abe9ed6cda64a5f [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
Peter Zijlstra	90eec10	2015-11-16 11:08:45 +0100	[diff] [blame]	20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	23	#include <linux/latencytop.h>
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	24	#include <linux/sched.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	25	#include <linux/cpumask.h>
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	26	#include <linux/cpuidle.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	27	#include <linux/slab.h>
				28	#include <linux/profile.h>
				29	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	30	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	31	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	32	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	33
				34	#include <trace/events/sched.h>
				35
				36	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	37
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	38	/*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	39	* Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	40	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	41	*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	42	* NOTE: this latency value is not the same as the concept of
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	43	* 'timeslice length' - timeslices in CFS are of variable length
				44	* and have no persistent notion like in traditional, time-slice
				45	* based scheduling concepts.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	46	*
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	47	* (to see the precise effective timeslice length of your workload,
				48	* run vmstat and monitor the context-switches (cs) field)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	49	*/
Mike Galbraith	2140692	2010-03-11 17:17:15 +0100	[diff] [blame]	50	unsigned int sysctl_sched_latency = 6000000ULL;
				51	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	52
				53	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	54	* The initial- and re-scaling of tunables is configurable
				55	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
				56	*
				57	* Options are:
				58	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				59	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				60	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				61	*/
				62	enum sched_tunable_scaling sysctl_sched_tunable_scaling
				63	= SCHED_TUNABLESCALING_LOG;
				64
				65	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	66	* Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	67	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	68	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	69	unsigned int sysctl_sched_min_granularity = 750000ULL;
				70	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	71
				72	/*
				73	* sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
				74	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	75	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	76
				77	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	78	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	79	* parent will (try to) run first.
				80	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	81	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	82
				83	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	84	* SCHED_OTHER wake-up granularity.
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	85	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	86	*
				87	* This option delays the preemption effects of decoupled workloads
				88	* and reduces their over-scheduling. Synchronous workloads will still
				89	* have immediate wakeup/sleep latencies.
				90	*/
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	91	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	92	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	93
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	94	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				95
Paul Turner	a7a4f8a	2010-11-15 15:47:06 -0800	[diff] [blame]	96	/*
				97	* The exponential sliding window over which load is averaged for shares
				98	* distribution.
				99	* (default: 10msec)
				100	*/
				101	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
				102
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	103	#ifdef CONFIG_CFS_BANDWIDTH
				104	/*
				105	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				106	* each time a cfs_rq requests quota.
				107	*
				108	* Note: in the case that the slice exceeds the runtime remaining (either due
				109	* to consumption or the quota being specified to be smaller than the slice)
				110	* we will always only issue the remaining available time.
				111	*
				112	* default: 5 msec, units: microseconds
				113	*/
				114	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				115	#endif
				116
Paul Gortmaker	8527632	2013-04-19 15:10:50 -0400	[diff] [blame]	117	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
				118	{
				119	lw->weight += inc;
				120	lw->inv_weight = 0;	/* 0 marks the cached inverse as stale; __update_inv_weight() refills it on demand */
				121	}
				122
				123	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
				124	{
				125	lw->weight -= dec;
				126	lw->inv_weight = 0;	/* 0 marks the cached inverse as stale; __update_inv_weight() refills it on demand */
				127	}
				128
				129	static inline void update_load_set(struct load_weight *lw, unsigned long w)
				130	{
				131	lw->weight = w;
				132	lw->inv_weight = 0;	/* 0 marks the cached inverse as stale; __update_inv_weight() refills it on demand */
				133	}
				134
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	135	/*
				136	* Increase the granularity value when there are more CPUs,
				137	* because with more CPUs the 'effective latency' as visible
				138	* to users decreases. But the relationship is not linear,
				139	* so pick a second-best guess by going with the log2 of the
				140	* number of CPUs.
				141	*
				142	* This idea comes from the SD scheduler of Con Kolivas:
				143	*/
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	144	static unsigned int get_update_sysctl_factor(void)	/* multiplier applied to the normalized_sysctl_* tunables */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	145	{
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	146	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);	/* online CPU count, capped at 8 */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	147	unsigned int factor;
				148
				149	switch (sysctl_sched_tunable_scaling) {
				150	case SCHED_TUNABLESCALING_NONE:
				151	factor = 1;
				152	break;
				153	case SCHED_TUNABLESCALING_LINEAR:
				154	factor = cpus;
				155	break;
				156	case SCHED_TUNABLESCALING_LOG:
				157	default:	/* unknown scaling modes fall back to logarithmic scaling */
				158	factor = 1 + ilog2(cpus);
				159	break;
				160	}
				161
				162	return factor;
				163	}
				164
				165	static void update_sysctl(void)	/* recompute the effective sysctl_* values from their normalized_* counterparts */
				166	{
				167	unsigned int factor = get_update_sysctl_factor();
				168
				169	#define SET_SYSCTL(name) \
				170	(sysctl_##name = (factor) * normalized_sysctl_##name)
				171	SET_SYSCTL(sched_min_granularity);
				172	SET_SYSCTL(sched_latency);
				173	SET_SYSCTL(sched_wakeup_granularity);
				174	#undef SET_SYSCTL
				175	}
				176
				177	void sched_init_granularity(void)	/* non-static entry point; simply rescales the tunables via update_sysctl() */
				178	{
				179	update_sysctl();
				180	}
				181
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	182	#define WMULT_CONST (~0U)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	183	#define WMULT_SHIFT 32
				184
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	185	static void __update_inv_weight(struct load_weight *lw)	/* lazily (re)compute lw->inv_weight as WMULT_CONST / scaled-down weight */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	186	{
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	187	unsigned long w;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	188
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	189	if (likely(lw->inv_weight))	/* non-zero: a previously computed inverse is still cached */
				190	return;
				191
				192	w = scale_load_down(lw->weight);
				193
				194	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))	/* division would yield 0 for w > WMULT_CONST; use 1 */
				195	lw->inv_weight = 1;
				196	else if (unlikely(!w))	/* zero weight: avoid dividing by zero, use the largest inverse */
				197	lw->inv_weight = WMULT_CONST;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	198	else
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	199	lw->inv_weight = WMULT_CONST / w;
				200	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	201
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	202	/*
				203	* delta_exec * weight / lw.weight
				204	* OR
				205	* (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
				206	*
				207	* Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
				208	* we're guaranteed shift stays positive because inv_weight is guaranteed to
				209	* fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
				210	*
				211	* Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
				212	* weight/lw.weight <= 1, and therefore our shift will also be positive.
				213	*/
				214	static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
				215	{
				216	u64 fact = scale_load_down(weight);
				217	int shift = WMULT_SHIFT;	/* final right shift; reduced below for every bit dropped from fact */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	218
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	219	__update_inv_weight(lw);	/* make sure lw->inv_weight is populated before it is used */
				220
				221	if (unlikely(fact >> 32)) {	/* weight does not fit in 32 bits: trade precision for range */
				222	while (fact >> 32) {
				223	fact >>= 1;
				224	shift--;
				225	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	226	}
				227
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	228	/* hint to use a 32x32->64 mul */
				229	fact = (u64)(u32)fact * lw->inv_weight;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	230
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	231	while (fact >> 32) {	/* shrink the product back to 32 bits, compensating in shift */
				232	fact >>= 1;
				233	shift--;
				234	}
				235
				236	return mul_u64_u32_shr(delta_exec, fact, shift);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	237	}
				238
				239
				240	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	241
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	242	/**************************************************************
				243	* CFS operations on generic schedulable entities:
				244	*/
				245
				246	#ifdef CONFIG_FAIR_GROUP_SCHED
				247
				248	/* Return the cpu runqueue to which this cfs_rq is attached (cfs_rq->rq) */
				249	static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
				250	{
				251	return cfs_rq->rq;
				252	}
				253
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	254	/* An entity is a task if it doesn't "own" a runqueue */
				255	#define entity_is_task(se) (!se->my_q)
				256
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	257	static inline struct task_struct *task_of(struct sched_entity *se)	/* task_struct that embeds this entity as its ->se member */
				258	{
				259	#ifdef CONFIG_SCHED_DEBUG
				260	WARN_ON_ONCE(!entity_is_task(se));	/* only meaningful for task entities, i.e. those with no my_q */
				261	#endif
				262	return container_of(se, struct task_struct, se);
				263	}
				264
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	265	/* Walk up scheduling entities hierarchy */
				266	#define for_each_sched_entity(se) \
				267	for (; se; se = se->parent)
				268
				269	static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)	/* cfs_rq recorded in the task's own sched_entity */
				270	{
				271	return p->se.cfs_rq;
				272	}
				273
				274	/* runqueue on which this entity is (to be) queued, as stored in se->cfs_rq */
				275	static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
				276	{
				277	return se->cfs_rq;
				278	}
				279
				280	/* runqueue "owned" by this group entity (grp->my_q) */
				281	static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
				282	{
				283	return grp->my_q;
				284	}
				285
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	286	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)	/* put cfs_rq on its rq's leaf_cfs_rq_list unless on_list is already set */
				287	{
				288	if (!cfs_rq->on_list) {
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	289	/*
				290	* Ensure we either appear before our parent (if already
				291	* enqueued) or force our parent to appear after us when it is
				292	* enqueued. The fact that we always enqueue bottom-up
				293	* reduces this to two cases.
				294	*/
				295	if (cfs_rq->tg->parent &&
				296	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
				297	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	298	&rq_of(cfs_rq)->leaf_cfs_rq_list);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	299	} else {
				300	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				301	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				302	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	303
				304	cfs_rq->on_list = 1;	/* remember membership so repeated calls are no-ops */
				305	}
				306	}
				307
				308	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)	/* unlink cfs_rq from the leaf list if on_list says it is there */
				309	{
				310	if (cfs_rq->on_list) {
				311	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				312	cfs_rq->on_list = 0;
				313	}
				314	}
				315
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	316	/* Iterate through all leaf cfs_rqs on a runqueue */
				317	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				318	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
				319
				320	/* Do the two (enqueued) entities belong to the same group? Returns the shared cfs_rq, or NULL if they differ. */
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	321	static inline struct cfs_rq *
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	322	is_same_group(struct sched_entity *se, struct sched_entity *pse)
				323	{
				324	if (se->cfs_rq == pse->cfs_rq)
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	325	return se->cfs_rq;
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	326
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	327	return NULL;
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	328	}
				329
				330	static inline struct sched_entity *parent_entity(struct sched_entity *se)	/* next entity up the hierarchy (se->parent) */
				331	{
				332	return se->parent;
				333	}
				334
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	335	static void
				336	find_matching_se(struct sched_entity **se, struct sched_entity **pse)	/* in/out: both pointers are advanced in place */
				337	{
				338	int se_depth, pse_depth;
				339
				340	/*
				341	* preemption test can be made between sibling entities who are in the
				342	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				343	* both tasks until we find their ancestors who are siblings of common
				344	* parent.
				345	*/
				346
				347	/* First walk up until both entities are at same depth */
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	348	se_depth = (*se)->depth;
				349	pse_depth = (*pse)->depth;
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	350
				351	while (se_depth > pse_depth) {
				352	se_depth--;
				353	*se = parent_entity(*se);
				354	}
				355
				356	while (pse_depth > se_depth) {
				357	pse_depth--;
				358	*pse = parent_entity(*pse);
				359	}
				360
				361	while (!is_same_group(*se, *pse)) {	/* then climb in lock-step until both share a cfs_rq */
				362	*se = parent_entity(*se);
				363	*pse = parent_entity(*pse);
				364	}
				365	}
				366
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	367	#else /* !CONFIG_FAIR_GROUP_SCHED */
				368
				369	static inline struct task_struct *task_of(struct sched_entity *se)	/* without group scheduling every entity is embedded in a task */
				370	{
				371	return container_of(se, struct task_struct, se);
				372	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	373
				374	static inline struct rq *rq_of(struct cfs_rq *cfs_rq)	/* cfs_rq is the ->cfs member embedded in struct rq */
				375	{
				376	return container_of(cfs_rq, struct rq, cfs);
				377	}
				378
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	379	#define entity_is_task(se) 1
				380
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	381	#define for_each_sched_entity(se) \
				382	for (; se; se = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	383
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	384	static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)	/* the single cfs_rq of the task's runqueue */
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	385	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	386	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	387	}
				388
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	389	static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)	/* resolved via the owning task's runqueue */
				390	{
				391	struct task_struct *p = task_of(se);
				392	struct rq *rq = task_rq(p);
				393
				394	return &rq->cfs;
				395	}
				396
				397	/* runqueue "owned" by this group: always NULL when group scheduling is compiled out */
				398	static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
				399	{
				400	return NULL;
				401	}
				402
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	403	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				404	{
				405	}
				406
				407	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				408	{
				409	}
				410
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	411	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				412	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
				413
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	414	static inline struct sched_entity *parent_entity(struct sched_entity *se)	/* no hierarchy without group scheduling */
				415	{
				416	return NULL;
				417	}
				418
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	419	static inline void
				420	find_matching_se(struct sched_entity **se, struct sched_entity **pse)	/* no-op stub: leaves both pointers untouched */
				421	{
				422	}
				423
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	424	#endif /* CONFIG_FAIR_GROUP_SCHED */
				425
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	426	static __always_inline
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	427	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	428
				429	/**************************************************************
				430	* Scheduling class tree data structure manipulation methods:
				431	*/
				432
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	433	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	434	{
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	435	s64 delta = (s64)(vruntime - max_vruntime);	/* compare via the signed difference rather than directly, so u64 wraparound is tolerated */
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	436	if (delta > 0)
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	437	max_vruntime = vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	438
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	439	return max_vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	440	}
				441
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	442	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	443	{
				444	s64 delta = (s64)(vruntime - min_vruntime);	/* compare via the signed difference rather than directly, so u64 wraparound is tolerated */
				445	if (delta < 0)
				446	min_vruntime = vruntime;
				447
				448	return min_vruntime;
				449	}
				450
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	451	static inline int entity_before(struct sched_entity *a,
				452	struct sched_entity *b)
				453	{
				454	return (s64)(a->vruntime - b->vruntime) < 0;	/* non-zero when a's vruntime is earlier than b's, by signed difference */
				455	}
				456
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	457	static void update_min_vruntime(struct cfs_rq *cfs_rq)	/* advance cfs_rq->min_vruntime towards the smallest vruntime of curr and the leftmost queued entity */
				458	{
				459	u64 vruntime = cfs_rq->min_vruntime;
				460
				461	if (cfs_rq->curr)
				462	vruntime = cfs_rq->curr->vruntime;
				463
				464	if (cfs_rq->rb_leftmost) {
				465	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				466	struct sched_entity,
				467	run_node);
				468
Peter Zijlstra	e17036d	2009-01-15 14:53:39 +0100	[diff] [blame]	469	if (!cfs_rq->curr)	/* nothing running: the leftmost entity alone decides */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	470	vruntime = se->vruntime;
				471	else
				472	vruntime = min_vruntime(vruntime, se->vruntime);
				473	}
				474
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	475	/* ensure we never gain time by being placed backwards. */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	476	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	477	#ifndef CONFIG_64BIT
				478	smp_wmb();	/* order the min_vruntime store before the copy below; presumably paired with a reader-side barrier outside this view */
				479	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				480	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	481	}
				482
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	483	/*
				484	* Enqueue an entity into the rb-tree (cfs_rq->tasks_timeline), ordered by vruntime:
				485	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	486	static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	487	{
				488	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				489	struct rb_node *parent = NULL;
				490	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	491	int leftmost = 1;
				492
				493	/*
				494	* Find the right place in the rbtree:
				495	*/
				496	while (*link) {
				497	parent = *link;
				498	entry = rb_entry(parent, struct sched_entity, run_node);
				499	/*
				500	* We don't care about collisions. Nodes with
				501	* the same key stay together.
				502	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	503	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	504	link = &parent->rb_left;
				505	} else {
				506	link = &parent->rb_right;
				507	leftmost = 0;	/* went right at least once, so se cannot be the leftmost node */
				508	}
				509	}
				510
				511	/*
				512	* Maintain a cache of leftmost tree entries (it is frequently
				513	* used):
				514	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	515	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	516	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	517
				518	rb_link_node(&se->run_node, parent, link);
				519	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	520	}
				521
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	522	static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)	/* remove se from the rb-tree, keeping the rb_leftmost cache valid */
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	523	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	524	if (cfs_rq->rb_leftmost == &se->run_node) {
				525	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	526
				527	next_node = rb_next(&se->run_node);	/* successor becomes the new leftmost (NULL if se was the only node) */
				528	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	529	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	530
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	531	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	532	}
				533
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	534	struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)	/* entity at the cached leftmost node, or NULL if none is cached */
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	535	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	536	struct rb_node *left = cfs_rq->rb_leftmost;
				537
				538	if (!left)
				539	return NULL;
				540
				541	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	542	}
				543
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	544	static struct sched_entity *__pick_next_entity(struct sched_entity *se)	/* in-order successor of se in the rb-tree, or NULL at the end */
				545	{
				546	struct rb_node *next = rb_next(&se->run_node);
				547
				548	if (!next)
				549	return NULL;
				550
				551	return rb_entry(next, struct sched_entity, run_node);
				552	}
				553
				554	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	555	struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)	/* rightmost entity in the rb-tree, or NULL if the tree is empty */
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	556	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	557	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	558
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	559	if (!last)
				560	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	561
				562	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	563	}
				564
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	565	/**************************************************************
				566	* Scheduling class statistics methods:
				567	*/
				568
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	569	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	570	void __user *buffer, size_t *lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	571	loff_t *ppos)
				572	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	573	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Nicholas Mc Guire	58ac93e	2015-05-15 21:05:42 +0200	[diff] [blame]	574	unsigned int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	575
				576	if (ret || !write)	/* on error, or on a plain read, there is nothing to recompute */
				577	return ret;
				578
				579	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				580	sysctl_sched_min_granularity);
				581
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	582	#define WRT_SYSCTL(name) \
				583	(normalized_sysctl_##name = sysctl_##name / (factor))
				584	WRT_SYSCTL(sched_min_granularity);	/* store the written values back in normalized (unscaled) form */
				585	WRT_SYSCTL(sched_latency);
				586	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	587	#undef WRT_SYSCTL
				588
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	589	return 0;
				590	}
				591	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	592
				593	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	594	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	595	*/
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	596	static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	597	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	598	if (unlikely(se->load.weight != NICE_0_LOAD))	/* at exactly NICE_0_LOAD the scaling is the identity, so skip it */
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	599	delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	600
				601	return delta;
				602	}
				603
				604	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	605	* The idea is to set a period in which each task runs once.
				606	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	607	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	608	* this period because otherwise the slices get too small.
				609	*
				610	* p = (nr <= nl) ? l : l*nr/nl
				611	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	612	static u64 __sched_period(unsigned long nr_running)
				613	{
Boqun Feng	8e2b0bf	2015-07-02 22:25:52 +0800	[diff] [blame]	614	if (unlikely(nr_running > sched_nr_latency))	/* too many tasks: stretch the period to one min_granularity per task */
				615	return nr_running * sysctl_sched_min_granularity;
				616	else
				617	return sysctl_sched_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	618	}
				619
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	620	/*
				621	* We calculate the wall-time slice from the period by taking a part
				622	* proportional to the weight.
				623	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	624	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	625	*/
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	626	static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	627	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	628	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);	/* count se itself if it is not queued yet */
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	629
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	630	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	631	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	632	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	633
				634	cfs_rq = cfs_rq_of(se);
				635	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	636
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	637	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	638	lw = cfs_rq->load;	/* work on a local copy so the runqueue's load is not modified */
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	639
				640	update_load_add(&lw, se->load.weight);
				641	load = &lw;
				642	}
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	643	slice = __calc_delta(slice, se->load.weight, load);	/* scale by this level's share: weight / runqueue weight */
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	644	}
				645	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	646	}
				647
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	648	/*
Andrei Epure	660cc00	2013-03-11 12:03:20 +0200	[diff] [blame]	649	* We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	650	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	651	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	652	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	653	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	654	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	655	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	656	}
				657
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	658	#ifdef CONFIG_SMP
Rik van Riel	ba7e5a2	2014-09-04 16:35:30 -0400	[diff] [blame]	659	static int select_idle_sibling(struct task_struct *p, int cpu);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	660	static unsigned long task_h_load(struct task_struct *p);
				661
/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
 * dependent on this value.
 */
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	670
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	671	/* Give new sched_entity start runnable values to heavy its load in infant time */
				672	void init_entity_runnable_average(struct sched_entity *se)
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	673	{
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	674	struct sched_avg *sa = &se->avg;
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	675
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	676	sa->last_update_time = 0;
				677	/*
				678	* sched_avg's period_contrib should be strictly less then 1024, so
				679	* we give it 1023 to make sure it is almost a period (1024us), and
				680	* will definitely be update (after enqueue).
				681	*/
				682	sa->period_contrib = 1023;
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	683	sa->load_avg = scale_load_down(se->load.weight);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	684	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
				685	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	686	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	687	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	688	}
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	689
				690	static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
				691	static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	692	#else
/* !CONFIG_SMP variant: there is nothing to initialize. */
void init_entity_runnable_average(struct sched_entity *se)
{
}
				696	#endif
				697
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	698	/*
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	699	* Update the current task's runtime statistics.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	700	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	701	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	702	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	703	struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	704	u64 now = rq_clock_task(rq_of(cfs_rq));
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	705	u64 delta_exec;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	706
				707	if (unlikely(!curr))
				708	return;
				709
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	710	delta_exec = now - curr->exec_start;
				711	if (unlikely((s64)delta_exec <= 0))
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	712	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	713
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	714	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	715
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	716	schedstat_set(curr->statistics.exec_max,
				717	max(delta_exec, curr->statistics.exec_max));
				718
				719	curr->sum_exec_runtime += delta_exec;
				720	schedstat_add(cfs_rq, exec_clock, delta_exec);
				721
				722	curr->vruntime += calc_delta_fair(delta_exec, curr);
				723	update_min_vruntime(cfs_rq);
				724
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	725	if (entity_is_task(curr)) {
				726	struct task_struct *curtask = task_of(curr);
				727
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	728	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	729	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	730	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	731	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	732
				733	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	734	}
				735
Stanislaw Gruszka	6e99891	2014-11-12 16:58:44 +0100	[diff] [blame]	736	static void update_curr_fair(struct rq *rq)
				737	{
				738	update_curr(cfs_rq_of(&rq->curr->se));
				739	}
				740
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	741	#ifdef CONFIG_SCHEDSTATS
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	742	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	743	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	744	{
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	745	u64 wait_start = rq_clock(rq_of(cfs_rq));
				746
				747	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
				748	likely(wait_start > se->statistics.wait_start))
				749	wait_start -= se->statistics.wait_start;
				750
				751	se->statistics.wait_start = wait_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	752	}
				753
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	754	static void
				755	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
				756	{
				757	struct task_struct *p;
				758	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
				759
				760	if (entity_is_task(se)) {
				761	p = task_of(se);
				762	if (task_on_rq_migrating(p)) {
				763	/*
				764	* Preserve migrating task's wait time so wait_start
				765	* time stamp can be adjusted to accumulate wait time
				766	* prior to migration.
				767	*/
				768	se->statistics.wait_start = delta;
				769	return;
				770	}
				771	trace_sched_stat_wait(p, delta);
				772	}
				773
				774	se->statistics.wait_max = max(se->statistics.wait_max, delta);
				775	se->statistics.wait_count++;
				776	se->statistics.wait_sum += delta;
				777	se->statistics.wait_start = 0;
				778	}
				779	#else
				780	static inline void
				781	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
				782	{
				783	}
				784
				785	static inline void
				786	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
				787	{
				788	}
				789	#endif
				790
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	791	/*
				792	* Task is being enqueued - update stats:
				793	*/
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	794	static void update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	795	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	796	/*
				797	* Are we enqueueing a waiting task? (for current tasks
				798	* a dequeue/enqueue event is a NOP)
				799	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	800	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	801	update_stats_wait_start(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	802	}
				803
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	804	static inline void
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	805	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	806	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	807	/*
				808	* Mark the end of the wait period if dequeueing a
				809	* waiting task:
				810	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	811	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	812	update_stats_wait_end(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	813	}
				814
				815	/*
				816	* We are picking a new current task - update its stats:
				817	*/
				818	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	819	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	820	{
				821	/*
				822	* We are starting a new run period:
				823	*/
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	824	se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	825	}
				826
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	827	/**************************************************
				828	* Scheduling class queueing methods:
				829	*/
				830
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	831	#ifdef CONFIG_NUMA_BALANCING
				832	/*
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	833	* Approximate time to scan a full NUMA task in ms. The task scan period is
				834	* calculated based on the tasks virtual memory size and
				835	* numa_balancing_scan_size.
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	836	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	837	unsigned int sysctl_numa_balancing_scan_period_min = 1000;
				838	unsigned int sysctl_numa_balancing_scan_period_max = 60000;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	839
				840	/* Portion of address space to scan in MB */
				841	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	842
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	843	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				844	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				845
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	846	static unsigned int task_nr_scan_windows(struct task_struct *p)
				847	{
				848	unsigned long rss = 0;
				849	unsigned long nr_scan_pages;
				850
				851	/*
				852	* Calculations based on RSS as non-present and empty pages are skipped
				853	* by the PTE scanner and NUMA hinting faults should be trapped based
				854	* on resident pages
				855	*/
				856	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
				857	rss = get_mm_rss(p->mm);
				858	if (!rss)
				859	rss = nr_scan_pages;
				860
				861	rss = round_up(rss, nr_scan_pages);
				862	return rss / nr_scan_pages;
				863	}
				864
				865	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
				866	#define MAX_SCAN_WINDOW 2560
				867
				868	static unsigned int task_scan_min(struct task_struct *p)
				869	{
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	870	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	871	unsigned int scan, floor;
				872	unsigned int windows = 1;
				873
Kirill Tkhai	6419265	2014-10-16 14:39:37 +0400	[diff] [blame]	874	if (scan_size < MAX_SCAN_WINDOW)
				875	windows = MAX_SCAN_WINDOW / scan_size;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	876	floor = 1000 / windows;
				877
				878	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
				879	return max_t(unsigned int, floor, scan);
				880	}
				881
				882	static unsigned int task_scan_max(struct task_struct *p)
				883	{
				884	unsigned int smin = task_scan_min(p);
				885	unsigned int smax;
				886
				887	/* Watch for min being lower than max due to floor calculations */
				888	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
				889	return max(smin, smax);
				890	}
				891
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	892	static void account_numa_enqueue(struct rq rq, struct task_struct p)
				893	{
				894	rq->nr_numa_running += (p->numa_preferred_nid != -1);
				895	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
				896	}
				897
				898	static void account_numa_dequeue(struct rq rq, struct task_struct p)
				899	{
				900	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
				901	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
				902	}
				903
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	904	struct numa_group {
				905	atomic_t refcount;
				906
				907	spinlock_t lock; /* nr_tasks, tasks */
				908	int nr_tasks;
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	909	pid_t gid;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	910
				911	struct rcu_head rcu;
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	912	nodemask_t active_nodes;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	913	unsigned long total_faults;
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	914	/*
				915	* Faults_cpu is used to decide whether memory should move
				916	* towards the CPU. As a consequence, these stats are weighted
				917	* more by CPU use than by memory faults.
				918	*/
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	919	unsigned long *faults_cpu;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	920	unsigned long faults[0];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	921	};
				922
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2

/* Memory and CPU locality */
#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)

/* Averaged statistics, and temporary buffers. */
#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
				931
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	932	pid_t task_numa_group_id(struct task_struct *p)
				933	{
				934	return p->numa_group ? p->numa_group->gid : 0;
				935	}
				936
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	937	/*
				938	* The averaged statistics, shared & private, memory & cpu,
				939	* occupy the first half of the array. The second half of the
				940	* array is for current counters, which are averaged into the
				941	* first set by task_numa_placement.
				942	*/
				943	static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	944	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	945	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	946	}
				947
				948	static inline unsigned long task_faults(struct task_struct *p, int nid)
				949	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	950	if (!p->numa_faults)
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	951	return 0;
				952
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	953	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				954	p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	955	}
				956
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	957	static inline unsigned long group_faults(struct task_struct *p, int nid)
				958	{
				959	if (!p->numa_group)
				960	return 0;
				961
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	962	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
				963	p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	964	}
				965
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	966	static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
				967	{
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	968	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
				969	group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	970	}
				971
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	972	/* Handle placement on systems where not all nodes are directly connected. */
				973	static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
				974	int maxdist, bool task)
				975	{
				976	unsigned long score = 0;
				977	int node;
				978
				979	/*
				980	* All nodes are directly connected, and the same distance
				981	* from each other. No need for fancy placement algorithms.
				982	*/
				983	if (sched_numa_topology_type == NUMA_DIRECT)
				984	return 0;
				985
				986	/*
				987	* This code is called for each node, introducing N^2 complexity,
				988	* which should be ok given the number of nodes rarely exceeds 8.
				989	*/
				990	for_each_online_node(node) {
				991	unsigned long faults;
				992	int dist = node_distance(nid, node);
				993
				994	/*
				995	* The furthest away nodes in the system are not interesting
				996	* for placement; nid was already counted.
				997	*/
				998	if (dist == sched_max_numa_distance \|\| node == nid)
				999	continue;
				1000
				1001	/*
				1002	* On systems with a backplane NUMA topology, compare groups
				1003	* of nodes, and move tasks towards the group with the most
				1004	* memory accesses. When comparing two nodes at distance
				1005	* "hoplimit", only nodes closer by than "hoplimit" are part
				1006	* of each group. Skip other nodes.
				1007	*/
				1008	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1009	dist > maxdist)
				1010	continue;
				1011
				1012	/* Add up the faults from nearby nodes. */
				1013	if (task)
				1014	faults = task_faults(p, node);
				1015	else
				1016	faults = group_faults(p, node);
				1017
				1018	/*
				1019	* On systems with a glueless mesh NUMA topology, there are
				1020	* no fixed "groups of nodes". Instead, nodes that are not
				1021	* directly connected bounce traffic through intermediate
				1022	* nodes; a numa_group can occupy any set of nodes.
				1023	* The further away a node is, the less the faults count.
				1024	* This seems to result in good task placement.
				1025	*/
				1026	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				1027	faults *= (sched_max_numa_distance - dist);
				1028	faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
				1029	}
				1030
				1031	score += faults;
				1032	}
				1033
				1034	return score;
				1035	}
				1036
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1037	/*
				1038	* These return the fraction of accesses done by a particular task, or
				1039	* task group, on a particular numa node. The group weight is given a
				1040	* larger multiplier, in order to group tasks together that are almost
				1041	* evenly spread out between numa nodes.
				1042	*/
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1043	static inline unsigned long task_weight(struct task_struct *p, int nid,
				1044	int dist)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1045	{
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1046	unsigned long faults, total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1047
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1048	if (!p->numa_faults)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1049	return 0;
				1050
				1051	total_faults = p->total_numa_faults;
				1052
				1053	if (!total_faults)
				1054	return 0;
				1055
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1056	faults = task_faults(p, nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1057	faults += score_nearby_nodes(p, nid, dist, true);
				1058
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1059	return 1000 * faults / total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1060	}
				1061
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1062	static inline unsigned long group_weight(struct task_struct *p, int nid,
				1063	int dist)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1064	{
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1065	unsigned long faults, total_faults;
				1066
				1067	if (!p->numa_group)
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1068	return 0;
				1069
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1070	total_faults = p->numa_group->total_faults;
				1071
				1072	if (!total_faults)
				1073	return 0;
				1074
				1075	faults = group_faults(p, nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1076	faults += score_nearby_nodes(p, nid, dist, false);
				1077
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1078	return 1000 * faults / total_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1079	}
				1080
Rik van Riel	10f3904	2014-01-27 17:03:44 -0500	[diff] [blame]	1081	bool should_numa_migrate_memory(struct task_struct p, struct page page,
				1082	int src_nid, int dst_cpu)
				1083	{
				1084	struct numa_group *ng = p->numa_group;
				1085	int dst_nid = cpu_to_node(dst_cpu);
				1086	int last_cpupid, this_cpupid;
				1087
				1088	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
				1089
				1090	/*
				1091	* Multi-stage node selection is used in conjunction with a periodic
				1092	* migration fault to build a temporal task<->page relation. By using
				1093	* a two-stage filter we remove short/unlikely relations.
				1094	*
				1095	* Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
				1096	* a task's usage of a particular page (n_p) per total usage of this
				1097	* page (n_t) (in a given time-span) to a probability.
				1098	*
				1099	* Our periodic faults will sample this probability and getting the
				1100	* same result twice in a row, given these samples are fully
				1101	* independent, is then given by P(n)^2, provided our sample period
				1102	* is sufficiently short compared to the usage pattern.
				1103	*
				1104	* This quadric squishes small probabilities, making it less likely we
				1105	* act on an unlikely task<->page relation.
				1106	*/
				1107	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
				1108	if (!cpupid_pid_unset(last_cpupid) &&
				1109	cpupid_to_nid(last_cpupid) != dst_nid)
				1110	return false;
				1111
				1112	/* Always allow migrate on private faults */
				1113	if (cpupid_match_pid(p, last_cpupid))
				1114	return true;
				1115
				1116	/* A shared fault, but p->numa_group has not been set up yet. */
				1117	if (!ng)
				1118	return true;
				1119
				1120	/*
				1121	* Do not migrate if the destination is not a node that
				1122	* is actively used by this numa group.
				1123	*/
				1124	if (!node_isset(dst_nid, ng->active_nodes))
				1125	return false;
				1126
				1127	/*
				1128	* Source is a node that is not actively used by this
				1129	* numa group, while the destination is. Migrate.
				1130	*/
				1131	if (!node_isset(src_nid, ng->active_nodes))
				1132	return true;
				1133
				1134	/*
				1135	* Both source and destination are nodes in active
				1136	* use by this numa group. Maximize memory bandwidth
				1137	* by migrating from more heavily used groups, to less
				1138	* heavily used ones, spreading the load around.
				1139	* Use a 1/4 hysteresis to avoid spurious page movement.
				1140	*/
				1141	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
				1142	}
				1143
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1144	static unsigned long weighted_cpuload(const int cpu);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1145	static unsigned long source_load(int cpu, int type);
				1146	static unsigned long target_load(int cpu, int type);
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	1147	static unsigned long capacity_of(int cpu);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1148	static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1149
/* Cached statistics for all CPUs within a node */
struct numa_stats {
	unsigned long nr_running;
	unsigned long load;

	/* Total compute capacity of CPUs on a node */
	unsigned long compute_capacity;

	/* Approximate capacity in terms of runnable tasks on a node */
	unsigned long task_capacity;
	/* Non-zero when nr_running is below task_capacity */
	int has_free_capacity;
};
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1162
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1163	/*
				1164	* XXX borrowed from update_sg_lb_stats
				1165	*/
				1166	static void update_numa_stats(struct numa_stats *ns, int nid)
				1167	{
Rik van Riel	83d7f24	2014-08-04 13:23:28 -0400	[diff] [blame]	1168	int smt, cpu, cpus = 0;
				1169	unsigned long capacity;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1170
				1171	memset(ns, 0, sizeof(*ns));
				1172	for_each_cpu(cpu, cpumask_of_node(nid)) {
				1173	struct rq *rq = cpu_rq(cpu);
				1174
				1175	ns->nr_running += rq->nr_running;
				1176	ns->load += weighted_cpuload(cpu);
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	1177	ns->compute_capacity += capacity_of(cpu);
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1178
				1179	cpus++;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1180	}
				1181
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1182	/*
				1183	* If we raced with hotplug and there are no CPUs left in our mask
				1184	* the @ns structure is NULL'ed and task_numa_compare() will
				1185	* not find this node attractive.
				1186	*
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1187	* We'll either bail at !has_free_capacity, or we'll detect a huge
				1188	* imbalance and bail there.
Peter Zijlstra	5eca82a	2013-11-06 18:47:57 +0100	[diff] [blame]	1189	*/
				1190	if (!cpus)
				1191	return;
				1192
Rik van Riel	83d7f24	2014-08-04 13:23:28 -0400	[diff] [blame]	1193	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
				1194	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
				1195	capacity = cpus / smt; /* cores */
				1196
				1197	ns->task_capacity = min_t(unsigned, capacity,
				1198	DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1199	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1200	}
				1201
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1202	struct task_numa_env {
				1203	struct task_struct *p;
				1204
				1205	int src_cpu, src_nid;
				1206	int dst_cpu, dst_nid;
				1207
				1208	struct numa_stats src_stats, dst_stats;
				1209
Wanpeng Li	40ea2b4	2013-12-05 19:10:17 +0800	[diff] [blame]	1210	int imbalance_pct;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1211	int dist;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1212
				1213	struct task_struct *best_task;
				1214	long best_imp;
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1215	int best_cpu;
				1216	};
				1217
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1218	static void task_numa_assign(struct task_numa_env *env,
				1219	struct task_struct *p, long imp)
				1220	{
				1221	if (env->best_task)
				1222	put_task_struct(env->best_task);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1223
				1224	env->best_task = p;
				1225	env->best_imp = imp;
				1226	env->best_cpu = env->dst_cpu;
				1227	}
				1228
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1229	static bool load_too_imbalanced(long src_load, long dst_load,
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1230	struct task_numa_env *env)
				1231	{
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1232	long imb, old_imb;
				1233	long orig_src_load, orig_dst_load;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1234	long src_capacity, dst_capacity;
				1235
				1236	/*
				1237	* The load is corrected for the CPU capacity available on each node.
				1238	*
				1239	* src_load dst_load
				1240	* ------------ vs ---------
				1241	* src_capacity dst_capacity
				1242	*/
				1243	src_capacity = env->src_stats.compute_capacity;
				1244	dst_capacity = env->dst_stats.compute_capacity;
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1245
				1246	/* We care about the slope of the imbalance, not the direction. */
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1247	if (dst_load < src_load)
				1248	swap(dst_load, src_load);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1249
				1250	/* Is the difference below the threshold? */
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1251	imb = dst_load * src_capacity * 100 -
				1252	src_load * dst_capacity * env->imbalance_pct;
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1253	if (imb <= 0)
				1254	return false;
				1255
				1256	/*
				1257	* The imbalance is above the allowed threshold.
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1258	* Compare it with the old imbalance.
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1259	*/
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1260	orig_src_load = env->src_stats.load;
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1261	orig_dst_load = env->dst_stats.load;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1262
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1263	if (orig_dst_load < orig_src_load)
				1264	swap(orig_dst_load, orig_src_load);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1265
Rik van Riel	e4991b2	2015-05-27 15:04:27 -0400	[diff] [blame]	1266	old_imb = orig_dst_load * src_capacity * 100 -
				1267	orig_src_load * dst_capacity * env->imbalance_pct;
				1268
				1269	/* Would this change make things worse? */
				1270	return (imb > old_imb);
Rik van Riel	e63da03	2014-05-14 13:22:21 -0400	[diff] [blame]	1271	}
				1272
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1273	/*
				1274	* This checks if the overall compute and NUMA accesses of the system would
				1275	* be improved if the source tasks was migrated to the target dst_cpu taking
				1276	* into account that it might be best if task running on the dst_cpu should
				1277	* be exchanged with the source task
				1278	*/
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1279	static void task_numa_compare(struct task_numa_env *env,
				1280	long taskimp, long groupimp)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1281	{
				1282	struct rq *src_rq = cpu_rq(env->src_cpu);
				1283	struct rq *dst_rq = cpu_rq(env->dst_cpu);
				1284	struct task_struct *cur;
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1285	long src_load, dst_load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1286	long load;
Rik van Riel	1c5d3eb	2014-06-23 11:46:15 -0400	[diff] [blame]	1287	long imp = env->p->numa_group ? groupimp : taskimp;
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1288	long moveimp = imp;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1289	int dist = env->dist;
Gavin Guo	1dff76b	2016-01-20 12:36:58 +0800	[diff] [blame]	1290	bool assigned = false;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1291
				1292	rcu_read_lock();
Kirill Tkhai	1effd9f	2014-10-22 11:17:11 +0400	[diff] [blame]	1293
				1294	raw_spin_lock_irq(&dst_rq->lock);
				1295	cur = dst_rq->curr;
				1296	/*
Gavin Guo	1dff76b	2016-01-20 12:36:58 +0800	[diff] [blame]	1297	* No need to move the exiting task or idle task.
Kirill Tkhai	1effd9f	2014-10-22 11:17:11 +0400	[diff] [blame]	1298	*/
				1299	if ((cur->flags & PF_EXITING) \|\| is_idle_task(cur))
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1300	cur = NULL;
Gavin Guo	1dff76b	2016-01-20 12:36:58 +0800	[diff] [blame]	1301	else {
				1302	/*
				1303	* The task_struct must be protected here to protect the
				1304	* p->numa_faults access in the task_weight since the
				1305	* numa_faults could already be freed in the following path:
				1306	* finish_task_switch()
				1307	* --> put_task_struct()
				1308	* --> __put_task_struct()
				1309	* --> task_numa_free()
				1310	*/
				1311	get_task_struct(cur);
				1312	}
				1313
Kirill Tkhai	1effd9f	2014-10-22 11:17:11 +0400	[diff] [blame]	1314	raw_spin_unlock_irq(&dst_rq->lock);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1315
				1316	/*
Peter Zijlstra	7af6833	2014-11-10 10:54:35 +0100	[diff] [blame]	1317	* Because we have preemption enabled we can get migrated around and
				1318	* end try selecting ourselves (current == env->p) as a swap candidate.
				1319	*/
				1320	if (cur == env->p)
				1321	goto unlock;
				1322
				1323	/*
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1324	* "imp" is the fault differential for the source task between the
				1325	* source and destination node. Calculate the total differential for
				1326	* the source task and potential destination task. The more negative
				1327	* the value is, the more rmeote accesses that would be expected to
				1328	* be incurred if the tasks were swapped.
				1329	*/
				1330	if (cur) {
				1331	/* Skip this swap candidate if cannot move to the source cpu */
				1332	if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
				1333	goto unlock;
				1334
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1335	/*
				1336	* If dst and source tasks are in the same NUMA group, or not
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1337	* in any group then look only at task weights.
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1338	*/
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1339	if (cur->numa_group == env->p->numa_group) {
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1340	imp = taskimp + task_weight(cur, env->src_nid, dist) -
				1341	task_weight(cur, env->dst_nid, dist);
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1342	/*
				1343	* Add some hysteresis to prevent swapping the
				1344	* tasks within a group over tiny differences.
				1345	*/
				1346	if (cur->numa_group)
				1347	imp -= imp/16;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1348	} else {
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1349	/*
				1350	* Compare the group weights. If a task is all by
				1351	* itself (not part of a group), use the task weight
				1352	* instead.
				1353	*/
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1354	if (cur->numa_group)
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1355	imp += group_weight(cur, env->src_nid, dist) -
				1356	group_weight(cur, env->dst_nid, dist);
Rik van Riel	ca28aa53	2013-10-07 11:29:32 +0100	[diff] [blame]	1357	else
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1358	imp += task_weight(cur, env->src_nid, dist) -
				1359	task_weight(cur, env->dst_nid, dist);
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1360	}
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1361	}
				1362
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1363	if (imp <= env->best_imp && moveimp <= env->best_imp)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1364	goto unlock;
				1365
				1366	if (!cur) {
				1367	/* Is there capacity at our destination? */
Rik van Riel	b932c03	2014-08-04 13:23:27 -0400	[diff] [blame]	1368	if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
Nicolas Pitre	1b6a749	2014-05-26 18:19:35 -0400	[diff] [blame]	1369	!env->dst_stats.has_free_capacity)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1370	goto unlock;
				1371
				1372	goto balance;
				1373	}
				1374
				1375	/* Balance doesn't matter much if we're running a task per cpu */
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1376	if (imp > env->best_imp && src_rq->nr_running == 1 &&
				1377	dst_rq->nr_running == 1)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1378	goto assign;
				1379
				1380	/*
				1381	* In the overloaded case, try and keep the load balanced.
				1382	*/
				1383	balance:
Peter Zijlstra	e720fff	2014-07-11 16:01:53 +0200	[diff] [blame]	1384	load = task_h_load(env->p);
				1385	dst_load = env->dst_stats.load + load;
				1386	src_load = env->src_stats.load - load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1387
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1388	if (moveimp > imp && moveimp > env->best_imp) {
				1389	/*
				1390	* If the improvement from just moving env->p direction is
				1391	* better than swapping tasks around, check if a move is
				1392	* possible. Store a slightly smaller score than moveimp,
				1393	* so an actually idle CPU will win.
				1394	*/
				1395	if (!load_too_imbalanced(src_load, dst_load, env)) {
				1396	imp = moveimp - 1;
Gavin Guo	1dff76b	2016-01-20 12:36:58 +0800	[diff] [blame]	1397	put_task_struct(cur);
Rik van Riel	0132c3e	2014-06-23 11:46:16 -0400	[diff] [blame]	1398	cur = NULL;
				1399	goto assign;
				1400	}
				1401	}
				1402
				1403	if (imp <= env->best_imp)
				1404	goto unlock;
				1405
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1406	if (cur) {
Peter Zijlstra	e720fff	2014-07-11 16:01:53 +0200	[diff] [blame]	1407	load = task_h_load(cur);
				1408	dst_load -= load;
				1409	src_load += load;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1410	}
				1411
Rik van Riel	28a2174	2014-06-23 11:46:13 -0400	[diff] [blame]	1412	if (load_too_imbalanced(src_load, dst_load, env))
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1413	goto unlock;
				1414
Rik van Riel	ba7e5a2	2014-09-04 16:35:30 -0400	[diff] [blame]	1415	/*
				1416	* One idle CPU per node is evaluated for a task numa move.
				1417	* Call select_idle_sibling to maybe find a better one.
				1418	*/
				1419	if (!cur)
				1420	env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
				1421
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1422	assign:
Gavin Guo	1dff76b	2016-01-20 12:36:58 +0800	[diff] [blame]	1423	assigned = true;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1424	task_numa_assign(env, cur, imp);
				1425	unlock:
				1426	rcu_read_unlock();
Gavin Guo	1dff76b	2016-01-20 12:36:58 +0800	[diff] [blame]	1427	/*
				1428	* The dst_rq->curr isn't assigned. The protection for task_struct is
				1429	* finished.
				1430	*/
				1431	if (cur && !assigned)
				1432	put_task_struct(cur);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1433	}
				1434
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1435	static void task_numa_find_cpu(struct task_numa_env *env,
				1436	long taskimp, long groupimp)
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1437	{
				1438	int cpu;
				1439
				1440	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
				1441	/* Skip this CPU if the source task cannot migrate */
				1442	if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
				1443	continue;
				1444
				1445	env->dst_cpu = cpu;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1446	task_numa_compare(env, taskimp, groupimp);
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1447	}
				1448	}
				1449
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1450	/* Only move tasks to a NUMA node less busy than the current node. */
				1451	static bool numa_has_capacity(struct task_numa_env *env)
				1452	{
				1453	struct numa_stats *src = &env->src_stats;
				1454	struct numa_stats *dst = &env->dst_stats;
				1455
				1456	if (src->has_free_capacity && !dst->has_free_capacity)
				1457	return false;
				1458
				1459	/*
				1460	* Only consider a task move if the source has a higher load
				1461	* than the destination, corrected for CPU capacity on each node.
				1462	*
				1463	* src->load dst->load
				1464	* --------------------- vs ---------------------
				1465	* src->compute_capacity dst->compute_capacity
				1466	*/
Srikar Dronamraju	44dcb04	2015-06-16 17:26:00 +0530	[diff] [blame]	1467	if (src->load * dst->compute_capacity * env->imbalance_pct >
				1468
				1469	dst->load * src->compute_capacity * 100)
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1470	return true;
				1471
				1472	return false;
				1473	}
				1474
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1475	static int task_numa_migrate(struct task_struct *p)
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1476	{
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1477	struct task_numa_env env = {
				1478	.p = p,
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1479
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1480	.src_cpu = task_cpu(p),
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	1481	.src_nid = task_node(p),
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1482
				1483	.imbalance_pct = 112,
				1484
				1485	.best_task = NULL,
				1486	.best_imp = 0,
				1487	.best_cpu = -1
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1488	};
				1489	struct sched_domain *sd;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1490	unsigned long taskweight, groupweight;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1491	int nid, ret, dist;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1492	long taskimp, groupimp;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1493
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1494	/*
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1495	* Pick the lowest SD_NUMA domain, as that would have the smallest
				1496	* imbalance and would be the first to start moving tasks about.
				1497	*
				1498	* And we want to avoid any moving of tasks about, as that would create
				1499	* random movement of tasks -- counter the numa conditions we're trying
				1500	* to satisfy here.
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1501	*/
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1502	rcu_read_lock();
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1503	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1504	if (sd)
				1505	env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1506	rcu_read_unlock();
				1507
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1508	/*
				1509	* Cpusets can break the scheduler domain tree into smaller
				1510	* balance domains, some of which do not cross NUMA boundaries.
				1511	* Tasks that are "trapped" in such domains cannot be migrated
				1512	* elsewhere, so there is no point in (re)trying.
				1513	*/
				1514	if (unlikely(!sd)) {
Wanpeng Li	de1b301	2013-12-12 15:23:24 +0800	[diff] [blame]	1515	p->numa_preferred_nid = task_node(p);
Rik van Riel	46a73e8	2013-11-11 19:29:25 -0500	[diff] [blame]	1516	return -EINVAL;
				1517	}
				1518
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1519	env.dst_nid = p->numa_preferred_nid;
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1520	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
				1521	taskweight = task_weight(p, env.src_nid, dist);
				1522	groupweight = group_weight(p, env.src_nid, dist);
				1523	update_numa_stats(&env.src_stats, env.src_nid);
				1524	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
				1525	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1526	update_numa_stats(&env.dst_stats, env.dst_nid);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1527
Rik van Riel	a43455a	2014-06-04 16:09:42 -0400	[diff] [blame]	1528	/* Try to find a spot on the preferred nid. */
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1529	if (numa_has_capacity(&env))
				1530	task_numa_find_cpu(&env, taskimp, groupimp);
Rik van Riel	e1dda8a	2013-10-07 11:29:19 +0100	[diff] [blame]	1531
Rik van Riel	9de05d4	2014-10-09 17:27:47 -0400	[diff] [blame]	1532	/*
				1533	* Look at other nodes in these cases:
				1534	* - there is no space available on the preferred_nid
				1535	* - the task is part of a numa_group that is interleaved across
				1536	* multiple NUMA nodes; in order to better consolidate the group,
				1537	* we need to check other locations.
				1538	*/
				1539	if (env.best_cpu == -1 \|\| (p->numa_group &&
				1540	nodes_weight(p->numa_group->active_nodes) > 1)) {
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1541	for_each_online_node(nid) {
				1542	if (nid == env.src_nid \|\| nid == p->numa_preferred_nid)
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1543	continue;
				1544
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1545	dist = node_distance(env.src_nid, env.dst_nid);
Rik van Riel	6c6b119	2014-10-17 03:29:52 -0400	[diff] [blame]	1546	if (sched_numa_topology_type == NUMA_BACKPLANE &&
				1547	dist != env.dist) {
				1548	taskweight = task_weight(p, env.src_nid, dist);
				1549	groupweight = group_weight(p, env.src_nid, dist);
				1550	}
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1551
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1552	/* Only consider nodes where both task and groups benefit */
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1553	taskimp = task_weight(p, nid, dist) - taskweight;
				1554	groupimp = group_weight(p, nid, dist) - groupweight;
Rik van Riel	887c290	2013-10-07 11:29:31 +0100	[diff] [blame]	1555	if (taskimp < 0 && groupimp < 0)
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1556	continue;
				1557
Rik van Riel	7bd9532	2014-10-17 03:29:51 -0400	[diff] [blame]	1558	env.dist = dist;
Mel Gorman	2c8a50a	2013-10-07 11:29:18 +0100	[diff] [blame]	1559	env.dst_nid = nid;
				1560	update_numa_stats(&env.dst_stats, env.dst_nid);
Rik van Riel	6f9aad0	2015-05-28 09:52:49 -0400	[diff] [blame]	1561	if (numa_has_capacity(&env))
				1562	task_numa_find_cpu(&env, taskimp, groupimp);
Mel Gorman	58d081b	2013-10-07 11:29:10 +0100	[diff] [blame]	1563	}
				1564	}
				1565
Rik van Riel	68d1b02	2014-04-11 13:00:29 -0400	[diff] [blame]	1566	/*
				1567	* If the task is part of a workload that spans multiple NUMA nodes,
				1568	* and is migrating into one of the workload's active nodes, remember
				1569	* this node as the task's preferred numa node, so the workload can
				1570	* settle down.
				1571	* A task that migrated to a second choice node will be better off
				1572	* trying for a better one later. Do not set the preferred node here.
				1573	*/
Rik van Riel	db015da	2014-06-23 11:41:34 -0400	[diff] [blame]	1574	if (p->numa_group) {
				1575	if (env.best_cpu == -1)
				1576	nid = env.src_nid;
				1577	else
				1578	nid = env.dst_nid;
				1579
				1580	if (node_isset(nid, p->numa_group->active_nodes))
				1581	sched_setnuma(p, env.dst_nid);
				1582	}
				1583
				1584	/* No better CPU than the current one was found. */
				1585	if (env.best_cpu == -1)
				1586	return -EAGAIN;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	1587
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1588	/*
				1589	* Reset the scan period if the task is being rescheduled on an
				1590	* alternative node to recheck if the tasks is now properly placed.
				1591	*/
				1592	p->numa_scan_period = task_scan_min(p);
				1593
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1594	if (env.best_task == NULL) {
Mel Gorman	286549d	2014-01-21 15:51:03 -0800	[diff] [blame]	1595	ret = migrate_task_to(p, env.best_cpu);
				1596	if (ret != 0)
				1597	trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1598	return ret;
				1599	}
				1600
				1601	ret = migrate_swap(p, env.best_task);
Mel Gorman	286549d	2014-01-21 15:51:03 -0800	[diff] [blame]	1602	if (ret != 0)
				1603	trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1604	put_task_struct(env.best_task);
				1605	return ret;
Mel Gorman	e6628d5	2013-10-07 11:29:02 +0100	[diff] [blame]	1606	}
				1607
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1608	/* Attempt to migrate a task to a CPU on the preferred node. */
				1609	static void numa_migrate_preferred(struct task_struct *p)
				1610	{
Rik van Riel	5085e2a	2014-04-11 13:00:28 -0400	[diff] [blame]	1611	unsigned long interval = HZ;
				1612
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1613	/* This task has no NUMA fault statistics yet */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1614	if (unlikely(p->numa_preferred_nid == -1 \|\| !p->numa_faults))
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1615	return;
				1616
				1617	/* Periodically retry migrating the task to the preferred node */
Rik van Riel	5085e2a	2014-04-11 13:00:28 -0400	[diff] [blame]	1618	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
				1619	p->numa_migrate_retry = jiffies + interval;
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1620
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1621	/* Success if task is already running on preferred CPU */
Wanpeng Li	de1b301	2013-12-12 15:23:24 +0800	[diff] [blame]	1622	if (task_node(p) == p->numa_preferred_nid)
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1623	return;
				1624
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1625	/* Otherwise, try migrate to a CPU on the preferred node */
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	1626	task_numa_migrate(p);
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	1627	}
				1628
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1629	/*
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1630	* Find the nodes on which the workload is actively running. We do this by
				1631	* tracking the nodes from which NUMA hinting faults are triggered. This can
				1632	* be different from the set of nodes where the workload's memory is currently
				1633	* located.
				1634	*
				1635	* The bitmask is used to make smarter decisions on when to do NUMA page
				1636	* migrations, To prevent flip-flopping, and excessive page migrations, nodes
				1637	* are added when they cause over 6/16 of the maximum number of faults, but
				1638	* only removed when they drop below 3/16.
				1639	*/
				1640	static void update_numa_active_node_mask(struct numa_group *numa_group)
				1641	{
				1642	unsigned long faults, max_faults = 0;
				1643	int nid;
				1644
				1645	for_each_online_node(nid) {
				1646	faults = group_faults_cpu(numa_group, nid);
				1647	if (faults > max_faults)
				1648	max_faults = faults;
				1649	}
				1650
				1651	for_each_online_node(nid) {
				1652	faults = group_faults_cpu(numa_group, nid);
				1653	if (!node_isset(nid, numa_group->active_nodes)) {
				1654	if (faults > max_faults * 6 / 16)
				1655	node_set(nid, numa_group->active_nodes);
				1656	} else if (faults < max_faults * 3 / 16)
				1657	node_clear(nid, numa_group->active_nodes);
				1658	}
				1659	}
				1660
				1661	/*
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1662	* When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
				1663	* increments. The more local the fault statistics are, the higher the scan
Rik van Riel	a22b4b0	2014-06-23 11:41:35 -0400	[diff] [blame]	1664	* period will be for the next scan window. If local/(local+remote) ratio is
				1665	* below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
				1666	* the scan period will decrease. Aim for 70% local accesses.
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1667	*/
				1668	#define NUMA_PERIOD_SLOTS 10
Rik van Riel	a22b4b0	2014-06-23 11:41:35 -0400	[diff] [blame]	1669	#define NUMA_PERIOD_THRESHOLD 7
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1670
				1671	/*
				1672	* Increase the scan period (slow down scanning) if the majority of
				1673	* our memory is already on our local node, or if the majority of
				1674	* the page accesses are shared with other processes.
				1675	* Otherwise, decrease the scan period.
				1676	*/
				1677	static void update_task_scan_period(struct task_struct *p,
				1678	unsigned long shared, unsigned long private)
				1679	{
				1680	unsigned int period_slot;
				1681	int ratio;
				1682	int diff;
				1683
				1684	unsigned long remote = p->numa_faults_locality[0];
				1685	unsigned long local = p->numa_faults_locality[1];
				1686
				1687	/*
				1688	* If there were no record hinting faults then either the task is
				1689	* completely idle or all activity is areas that are not of interest
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	1690	* to automatic numa balancing. Related to that, if there were failed
				1691	* migration then it implies we are migrating too quickly or the local
				1692	* node is overloaded. In either case, scan slower
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1693	*/
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	1694	if (local + shared == 0 \|\| p->numa_faults_locality[2]) {
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1695	p->numa_scan_period = min(p->numa_scan_period_max,
				1696	p->numa_scan_period << 1);
				1697
				1698	p->mm->numa_next_scan = jiffies +
				1699	msecs_to_jiffies(p->numa_scan_period);
				1700
				1701	return;
				1702	}
				1703
				1704	/*
				1705	* Prepare to scale scan period relative to the current period.
				1706	* == NUMA_PERIOD_THRESHOLD scan period stays the same
				1707	* < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
				1708	* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
				1709	*/
				1710	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
				1711	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
				1712	if (ratio >= NUMA_PERIOD_THRESHOLD) {
				1713	int slot = ratio - NUMA_PERIOD_THRESHOLD;
				1714	if (!slot)
				1715	slot = 1;
				1716	diff = slot * period_slot;
				1717	} else {
				1718	diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
				1719
				1720	/*
				1721	* Scale scan rate increases based on sharing. There is an
				1722	* inverse relationship between the degree of sharing and
				1723	* the adjustment made to the scanning period. Broadly
				1724	* speaking the intent is that there is little point
				1725	* scanning faster if shared accesses dominate as it may
				1726	* simply bounce migrations uselessly
				1727	*/
Yasuaki Ishimatsu	2847c90	2014-10-22 16:04:35 +0900	[diff] [blame]	1728	ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1729	diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
				1730	}
				1731
				1732	p->numa_scan_period = clamp(p->numa_scan_period + diff,
				1733	task_scan_min(p), task_scan_max(p));
				1734	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
				1735	}
				1736
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1737	/*
				1738	* Get the fraction of time the task has been running since the last
				1739	* NUMA placement cycle. The scheduler keeps similar statistics, but
				1740	* decays those on a 32ms period, which is orders of magnitude off
				1741	* from the dozens-of-seconds NUMA balancing period. Use the scheduler
				1742	* stats only if the task is so new there are no NUMA statistics yet.
				1743	*/
				1744	static u64 numa_get_avg_runtime(struct task_struct p, u64 period)
				1745	{
				1746	u64 runtime, delta, now;
				1747	/* Use the start of this time slice to avoid calculations. */
				1748	now = p->se.exec_start;
				1749	runtime = p->se.sum_exec_runtime;
				1750
				1751	if (p->last_task_numa_placement) {
				1752	delta = runtime - p->last_sum_exec_runtime;
				1753	*period = now - p->last_task_numa_placement;
				1754	} else {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	1755	delta = p->se.avg.load_sum / p->se.load.weight;
				1756	*period = LOAD_AVG_MAX;
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1757	}
				1758
				1759	p->last_sum_exec_runtime = runtime;
				1760	p->last_task_numa_placement = now;
				1761
				1762	return delta;
				1763	}
				1764
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	1765	/*
				1766	* Determine the preferred nid for a task in a numa_group. This needs to
				1767	* be done in a way that produces consistent results with group_weight,
				1768	* otherwise workloads might not converge.
				1769	*/
				1770	static int preferred_group_nid(struct task_struct *p, int nid)
				1771	{
				1772	nodemask_t nodes;
				1773	int dist;
				1774
				1775	/* Direct connections between all NUMA nodes. */
				1776	if (sched_numa_topology_type == NUMA_DIRECT)
				1777	return nid;
				1778
				1779	/*
				1780	* On a system with glueless mesh NUMA topology, group_weight
				1781	* scores nodes according to the number of NUMA hinting faults on
				1782	* both the node itself, and on nearby nodes.
				1783	*/
				1784	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
				1785	unsigned long score, max_score = 0;
				1786	int node, max_node = nid;
				1787
				1788	dist = sched_max_numa_distance;
				1789
				1790	for_each_online_node(node) {
				1791	score = group_weight(p, node, dist);
				1792	if (score > max_score) {
				1793	max_score = score;
				1794	max_node = node;
				1795	}
				1796	}
				1797	return max_node;
				1798	}
				1799
				1800	/*
				1801	* Finding the preferred nid in a system with NUMA backplane
				1802	* interconnect topology is more involved. The goal is to locate
				1803	* tasks from numa_groups near each other in the system, and
				1804	* untangle workloads from different sides of the system. This requires
				1805	* searching down the hierarchy of node groups, recursively searching
				1806	* inside the highest scoring group of nodes. The nodemask tricks
				1807	* keep the complexity of the search down.
				1808	*/
				1809	nodes = node_online_map;
				1810	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
				1811	unsigned long max_faults = 0;
Jan Beulich	8190747	2015-01-23 08:25:38 +0000	[diff] [blame]	1812	nodemask_t max_group = NODE_MASK_NONE;
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	1813	int a, b;
				1814
				1815	/* Are there nodes at this distance from each other? */
				1816	if (!find_numa_distance(dist))
				1817	continue;
				1818
				1819	for_each_node_mask(a, nodes) {
				1820	unsigned long faults = 0;
				1821	nodemask_t this_group;
				1822	nodes_clear(this_group);
				1823
				1824	/* Sum group's NUMA faults; includes a==b case. */
				1825	for_each_node_mask(b, nodes) {
				1826	if (node_distance(a, b) < dist) {
				1827	faults += group_faults(p, b);
				1828	node_set(b, this_group);
				1829	node_clear(b, nodes);
				1830	}
				1831	}
				1832
				1833	/* Remember the top group. */
				1834	if (faults > max_faults) {
				1835	max_faults = faults;
				1836	max_group = this_group;
				1837	/*
				1838	* subtle: at the smallest distance there is
				1839	* just one node left in each "group", the
				1840	* winner is the preferred nid.
				1841	*/
				1842	nid = a;
				1843	}
				1844	}
				1845	/* Next round, evaluate the nodes within max_group. */
Jan Beulich	890a540	2015-02-09 12:30:00 +0100	[diff] [blame]	1846	if (!max_faults)
				1847	break;
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	1848	nodes = max_group;
				1849	}
				1850	return nid;
				1851	}
				1852
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1853	static void task_numa_placement(struct task_struct *p)
				1854	{
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1855	int seq, nid, max_nid = -1, max_group_nid = -1;
				1856	unsigned long max_faults = 0, max_group_faults = 0;
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1857	unsigned long fault_types[2] = { 0, 0 };
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1858	unsigned long total_faults;
				1859	u64 runtime, period;
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	1860	spinlock_t *group_lock = NULL;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1861
Jason Low	7e5a2c1	2015-04-30 17:28:14 -0700	[diff] [blame]	1862	/*
				1863	* The p->mm->numa_scan_seq field gets updated without
				1864	* exclusive access. Use READ_ONCE() here to ensure
				1865	* that the field is read in a single access:
				1866	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	1867	seq = READ_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1868	if (p->numa_scan_seq == seq)
				1869	return;
				1870	p->numa_scan_seq = seq;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1871	p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1872
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1873	total_faults = p->numa_faults_locality[0] +
				1874	p->numa_faults_locality[1];
				1875	runtime = numa_get_avg_runtime(p, &period);
				1876
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	1877	/* If the task is part of a group prevent parallel updates to group stats */
				1878	if (p->numa_group) {
				1879	group_lock = &p->numa_group->lock;
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	1880	spin_lock_irq(group_lock);
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	1881	}
				1882
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	1883	/* Find the node with the highest number of faults */
				1884	for_each_online_node(nid) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1885	/* Keep track of the offsets in numa_faults array */
				1886	int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1887	unsigned long faults = 0, group_faults = 0;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1888	int priv;
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	1889
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	1890	for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1891	long diff, f_diff, f_weight;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1892
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1893	mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
				1894	membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
				1895	cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
				1896	cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	1897
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1898	/* Decay existing window, copy faults since last scan */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1899	diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
				1900	fault_types[priv] += p->numa_faults[membuf_idx];
				1901	p->numa_faults[membuf_idx] = 0;
Mel Gorman	fb13c7e	2013-10-07 11:29:17 +0100	[diff] [blame]	1902
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1903	/*
				1904	* Normalize the faults_from, so all tasks in a group
				1905	* count according to CPU use, instead of by the raw
				1906	* number of faults. Tasks with little runtime have
				1907	* little over-all impact on throughput, and thus their
				1908	* faults are less important.
				1909	*/
				1910	f_weight = div64_u64(runtime << 16, period + 1);
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1911	f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
Rik van Riel	7e2703e	2014-01-27 17:03:45 -0500	[diff] [blame]	1912	(total_faults + 1);
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1913	f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
				1914	p->numa_faults[cpubuf_idx] = 0;
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	1915
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1916	p->numa_faults[mem_idx] += diff;
				1917	p->numa_faults[cpu_idx] += f_diff;
				1918	faults += p->numa_faults[mem_idx];
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1919	p->total_numa_faults += diff;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1920	if (p->numa_group) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1921	/*
				1922	* safe because we can only change our own group
				1923	*
				1924	* mem_idx represents the offset for a given
				1925	* nid and priv in a specific region because it
				1926	* is at the beginning of the numa_faults array.
				1927	*/
				1928	p->numa_group->faults[mem_idx] += diff;
				1929	p->numa_group->faults_cpu[mem_idx] += f_diff;
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	1930	p->numa_group->total_faults += diff;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	1931	group_faults += p->numa_group->faults[mem_idx];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1932	}
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	1933	}
				1934
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	1935	if (faults > max_faults) {
				1936	max_faults = faults;
				1937	max_nid = nid;
				1938	}
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	1939
				1940	if (group_faults > max_group_faults) {
				1941	max_group_faults = group_faults;
				1942	max_group_nid = nid;
				1943	}
				1944	}
				1945
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	1946	update_task_scan_period(p, fault_types[0], fault_types[1]);
				1947
Mel Gorman	7dbd13e	2013-10-07 11:29:29 +0100	[diff] [blame]	1948	if (p->numa_group) {
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1949	update_numa_active_node_mask(p->numa_group);
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	1950	spin_unlock_irq(group_lock);
Rik van Riel	5400941	2014-10-17 03:29:53 -0400	[diff] [blame]	1951	max_nid = preferred_group_nid(p, max_group_nid);
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame]	1952	}
				1953
Rik van Riel	bb97fc3	2014-06-04 16:33:15 -0400	[diff] [blame]	1954	if (max_faults) {
				1955	/* Set the new preferred node */
				1956	if (max_nid != p->numa_preferred_nid)
				1957	sched_setnuma(p, max_nid);
				1958
				1959	if (task_node(p) != p->numa_preferred_nid)
				1960	numa_migrate_preferred(p);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	1961	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1962	}
				1963
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1964	static inline int get_numa_group(struct numa_group *grp)
				1965	{
				1966	return atomic_inc_not_zero(&grp->refcount);
				1967	}
				1968
				1969	static inline void put_numa_group(struct numa_group *grp)
				1970	{
				1971	if (atomic_dec_and_test(&grp->refcount))
				1972	kfree_rcu(grp, rcu);
				1973	}
				1974
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	1975	static void task_numa_group(struct task_struct *p, int cpupid, int flags,
				1976	int *priv)
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1977	{
				1978	struct numa_group grp, my_grp;
				1979	struct task_struct *tsk;
				1980	bool join = false;
				1981	int cpu = cpupid_to_cpu(cpupid);
				1982	int i;
				1983
				1984	if (unlikely(!p->numa_group)) {
				1985	unsigned int size = sizeof(struct numa_group) +
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	1986	4nr_node_idssizeof(unsigned long);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1987
				1988	grp = kzalloc(size, GFP_KERNEL \| __GFP_NOWARN);
				1989	if (!grp)
				1990	return;
				1991
				1992	atomic_set(&grp->refcount, 1);
				1993	spin_lock_init(&grp->lock);
Mel Gorman	e29cf08	2013-10-07 11:29:22 +0100	[diff] [blame]	1994	grp->gid = p->pid;
Rik van Riel	50ec8a4	2014-01-27 17:03:42 -0500	[diff] [blame]	1995	/* Second half of the array tracks nids where faults happen */
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	1996	grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
				1997	nr_node_ids;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	1998
Rik van Riel	20e07de	2014-01-27 17:03:43 -0500	[diff] [blame]	1999	node_set(task_node(current), grp->active_nodes);
				2000
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2001	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2002	grp->faults[i] = p->numa_faults[i];
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2003
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2004	grp->total_faults = p->total_numa_faults;
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2005
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2006	grp->nr_tasks++;
				2007	rcu_assign_pointer(p->numa_group, grp);
				2008	}
				2009
				2010	rcu_read_lock();
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	2011	tsk = READ_ONCE(cpu_rq(cpu)->curr);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2012
				2013	if (!cpupid_match_pid(tsk, cpupid))
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2014	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2015
				2016	grp = rcu_dereference(tsk->numa_group);
				2017	if (!grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2018	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2019
				2020	my_grp = p->numa_group;
				2021	if (grp == my_grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2022	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2023
				2024	/*
				2025	* Only join the other group if its bigger; if we're the bigger group,
				2026	* the other task will join us.
				2027	*/
				2028	if (my_grp->nr_tasks > grp->nr_tasks)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2029	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2030
				2031	/*
				2032	* Tie-break on the grp address.
				2033	*/
				2034	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2035	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2036
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2037	/* Always join threads in the same process. */
				2038	if (tsk->mm == current->mm)
				2039	join = true;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2040
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2041	/* Simple filter to avoid false positives due to PID collisions */
				2042	if (flags & TNF_SHARED)
				2043	join = true;
				2044
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	2045	/* Update priv based on whether false sharing was detected */
				2046	*priv = !join;
				2047
Rik van Riel	dabe1d9	2013-10-07 11:29:34 +0100	[diff] [blame]	2048	if (join && !get_numa_group(grp))
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2049	goto no_join;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2050
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2051	rcu_read_unlock();
				2052
				2053	if (!join)
				2054	return;
				2055
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2056	BUG_ON(irqs_disabled());
				2057	double_lock_irq(&my_grp->lock, &grp->lock);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2058
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2059	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2060	my_grp->faults[i] -= p->numa_faults[i];
				2061	grp->faults[i] += p->numa_faults[i];
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2062	}
				2063	my_grp->total_faults -= p->total_numa_faults;
				2064	grp->total_faults += p->total_numa_faults;
				2065
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2066	my_grp->nr_tasks--;
				2067	grp->nr_tasks++;
				2068
				2069	spin_unlock(&my_grp->lock);
Mike Galbraith	60e69ee	2014-04-07 10:55:15 +0200	[diff] [blame]	2070	spin_unlock_irq(&grp->lock);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2071
				2072	rcu_assign_pointer(p->numa_group, grp);
				2073
				2074	put_numa_group(my_grp);
Peter Zijlstra	3354781	2013-10-09 10:24:48 +0200	[diff] [blame]	2075	return;
				2076
				2077	no_join:
				2078	rcu_read_unlock();
				2079	return;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2080	}
				2081
				2082	void task_numa_free(struct task_struct *p)
				2083	{
				2084	struct numa_group *grp = p->numa_group;
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2085	void *numa_faults = p->numa_faults;
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2086	unsigned long flags;
				2087	int i;
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2088
				2089	if (grp) {
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2090	spin_lock_irqsave(&grp->lock, flags);
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2091	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2092	grp->faults[i] -= p->numa_faults[i];
Mel Gorman	989348b	2013-10-07 11:29:40 +0100	[diff] [blame]	2093	grp->total_faults -= p->total_numa_faults;
				2094
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2095	grp->nr_tasks--;
Steven Rostedt	e9dd685	2014-05-27 17:02:04 -0400	[diff] [blame]	2096	spin_unlock_irqrestore(&grp->lock, flags);
Andreea-Cristina Bernat	35b123e	2014-08-22 17:50:43 +0300	[diff] [blame]	2097	RCU_INIT_POINTER(p->numa_group, NULL);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2098	put_numa_group(grp);
				2099	}
				2100
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2101	p->numa_faults = NULL;
Rik van Riel	8272701	2013-10-07 11:29:28 +0100	[diff] [blame]	2102	kfree(numa_faults);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2103	}
				2104
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2105	/*
				2106	* Got a PROT_NONE fault for a page on @node.
				2107	*/
Rik van Riel	58b46da	2014-01-27 17:03:47 -0500	[diff] [blame]	2108	void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2109	{
				2110	struct task_struct *p = current;
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame]	2111	bool migrated = flags & TNF_MIGRATED;
Rik van Riel	58b46da	2014-01-27 17:03:47 -0500	[diff] [blame]	2112	int cpu_node = task_node(current);
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2113	int local = !!(flags & TNF_FAULT_LOCAL);
Mel Gorman	ac8e895	2013-10-07 11:29:03 +0100	[diff] [blame]	2114	int priv;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2115
Srikar Dronamraju	2a59572	2015-08-11 21:54:21 +0530	[diff] [blame]	2116	if (!static_branch_likely(&sched_numa_balancing))
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	2117	return;
				2118
Mel Gorman	9ff1d9f	2013-10-07 11:29:04 +0100	[diff] [blame]	2119	/* for example, ksmd faulting in a user's mm */
				2120	if (!p->mm)
				2121	return;
				2122
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2123	/* Allocate buffer to track faults on a per-node basis */
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2124	if (unlikely(!p->numa_faults)) {
				2125	int size = sizeof(p->numa_faults)
Rik van Riel	be1e4e7	2014-01-27 17:03:48 -0500	[diff] [blame]	2126	NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2127
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2128	p->numa_faults = kzalloc(size, GFP_KERNEL\|__GFP_NOWARN);
				2129	if (!p->numa_faults)
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2130	return;
Mel Gorman	745d614	2013-10-07 11:28:59 +0100	[diff] [blame]	2131
Mel Gorman	83e1d2c	2013-10-07 11:29:27 +0100	[diff] [blame]	2132	p->total_numa_faults = 0;
Rik van Riel	04bb2f9	2013-10-07 11:29:36 +0100	[diff] [blame]	2133	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2134	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2135
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	2136	/*
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2137	* First accesses are treated as private, otherwise consider accesses
				2138	* to be private if the accessing pid has not changed
				2139	*/
				2140	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
				2141	priv = 1;
				2142	} else {
				2143	priv = cpupid_match_pid(p, last_cpupid);
Peter Zijlstra	6688cc0	2013-10-07 11:29:24 +0100	[diff] [blame]	2144	if (!priv && !(flags & TNF_NO_GROUP))
Mel Gorman	3e6a941	2013-10-07 11:29:35 +0100	[diff] [blame]	2145	task_numa_group(p, last_cpupid, flags, &priv);
Peter Zijlstra	8c8a743	2013-10-07 11:29:21 +0100	[diff] [blame]	2146	}
				2147
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2148	/*
				2149	* If a workload spans multiple NUMA nodes, a shared fault that
				2150	* occurs wholly within the set of nodes that the workload is
				2151	* actively using should be counted as local. This allows the
				2152	* scan rate to slow down when a workload has settled down.
				2153	*/
				2154	if (!priv && !local && p->numa_group &&
				2155	node_isset(cpu_node, p->numa_group->active_nodes) &&
				2156	node_isset(mem_node, p->numa_group->active_nodes))
				2157	local = 1;
				2158
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2159	task_numa_placement(p);
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	2160
Rik van Riel	2739d3e	2013-10-07 11:29:41 +0100	[diff] [blame]	2161	/*
				2162	* Retry task to preferred node migration periodically, in case it
				2163	* case it previously failed, or the scheduler moved us.
				2164	*/
				2165	if (time_after(jiffies, p->numa_migrate_retry))
Mel Gorman	6b9a746	2013-10-07 11:29:11 +0100	[diff] [blame]	2166	numa_migrate_preferred(p);
				2167
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	2168	if (migrated)
				2169	p->numa_pages_migrated += pages;
Mel Gorman	074c238	2015-03-25 15:55:42 -0700	[diff] [blame]	2170	if (flags & TNF_MIGRATE_FAIL)
				2171	p->numa_faults_locality[2] += pages;
Ingo Molnar	b32e86b	2013-10-07 11:29:30 +0100	[diff] [blame]	2172
Iulia Manda	44dba3d	2014-10-31 02:13:31 +0200	[diff] [blame]	2173	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
				2174	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
Rik van Riel	792568e	2014-04-11 13:00:27 -0400	[diff] [blame]	2175	p->numa_faults_locality[local] += pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2176	}
				2177
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2178	static void reset_ptenuma_scan(struct task_struct *p)
				2179	{
Jason Low	7e5a2c1	2015-04-30 17:28:14 -0700	[diff] [blame]	2180	/*
				2181	* We only did a read acquisition of the mmap sem, so
				2182	* p->mm->numa_scan_seq is written to without exclusive access
				2183	* and the update is not guaranteed to be atomic. That's not
				2184	* much of an issue though, since this is just used for
				2185	* statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
				2186	* expensive, to avoid any form of compiler optimizations:
				2187	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	2188	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2189	p->mm->numa_scan_offset = 0;
				2190	}
				2191
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2192	/*
				2193	* The expensive part of numa migration is done from task_work context.
				2194	* Triggered from task_tick_numa().
				2195	*/
				2196	void task_numa_work(struct callback_head *work)
				2197	{
				2198	unsigned long migrate, next_scan, now = jiffies;
				2199	struct task_struct *p = current;
				2200	struct mm_struct *mm = p->mm;
Rik van Riel	5117084	2015-11-05 15:56:23 -0500	[diff] [blame]	2201	u64 runtime = p->se.sum_exec_runtime;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2202	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2203	unsigned long start, end;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2204	unsigned long nr_pte_updates = 0;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2205	long pages, virtpages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2206
				2207	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
				2208
				2209	work->next = work; /* protect against double add */
				2210	/*
				2211	* Who cares about NUMA placement when they're dying.
				2212	*
				2213	* NOTE: make sure not to dereference p->mm before this check,
				2214	* exit_task_work() happens _after_ exit_mm() so we could be called
				2215	* without p->mm even though we still had it when we enqueued this
				2216	* work.
				2217	*/
				2218	if (p->flags & PF_EXITING)
				2219	return;
				2220
Mel Gorman	930aa17	2013-10-07 11:29:37 +0100	[diff] [blame]	2221	if (!mm->numa_next_scan) {
Mel Gorman	7e8d16b	2013-10-07 11:28:54 +0100	[diff] [blame]	2222	mm->numa_next_scan = now +
				2223	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	2224	}
				2225
				2226	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2227	* Enforce maximal scan/migration frequency..
				2228	*/
				2229	migrate = mm->numa_next_scan;
				2230	if (time_before(now, migrate))
				2231	return;
				2232
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2233	if (p->numa_scan_period == 0) {
				2234	p->numa_scan_period_max = task_scan_max(p);
				2235	p->numa_scan_period = task_scan_min(p);
				2236	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2237
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	2238	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2239	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				2240	return;
				2241
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	2242	/*
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	2243	* Delay this task enough that another task of this mm will likely win
				2244	* the next time around.
				2245	*/
				2246	p->node_stamp += 2 * TICK_NSEC;
				2247
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2248	start = mm->numa_scan_offset;
				2249	pages = sysctl_numa_balancing_scan_size;
				2250	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2251	virtpages = pages * 8; /* Scan up to this much virtual space */
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2252	if (!pages)
				2253	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2254
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2255
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2256	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2257	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2258	if (!vma) {
				2259	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2260	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2261	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2262	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2263	for (; vma; vma = vma->vm_next) {
Naoya Horiguchi	6b79c57	2015-04-07 14:26:47 -0700	[diff] [blame]	2264	if (!vma_migratable(vma) \|\| !vma_policy_mof(vma) \|\|
Mel Gorman	8e76d4e	2015-06-10 11:15:00 -0700	[diff] [blame]	2265	is_vm_hugetlb_page(vma) \|\| (vma->vm_flags & VM_MIXEDMAP)) {
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2266	continue;
Naoya Horiguchi	6b79c57	2015-04-07 14:26:47 -0700	[diff] [blame]	2267	}
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2268
Mel Gorman	4591ce4f	2013-10-07 11:29:13 +0100	[diff] [blame]	2269	/*
				2270	* Shared library pages mapped by multiple processes are not
				2271	* migrated as it is expected they are cache replicated. Avoid
				2272	* hinting faults in read-only file-backed mappings or the vdso
				2273	* as migrating the pages will be of marginal benefit.
				2274	*/
				2275	if (!vma->vm_mm \|\|
				2276	(vma->vm_file && (vma->vm_flags & (VM_READ\|VM_WRITE)) == (VM_READ)))
				2277	continue;
				2278
Mel Gorman	3c67f47	2013-12-18 17:08:40 -0800	[diff] [blame]	2279	/*
				2280	* Skip inaccessible VMAs to avoid any confusion between
				2281	* PROT_NONE and NUMA hinting ptes
				2282	*/
				2283	if (!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE)))
				2284	continue;
				2285
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2286	do {
				2287	start = max(start, vma->vm_start);
				2288	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				2289	end = min(end, vma->vm_end);
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2290	nr_pte_updates = change_prot_numa(vma, start, end);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2291
				2292	/*
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2293	* Try to scan sysctl_numa_balancing_size worth of
				2294	* hpages that have at least one present PTE that
				2295	* is not already pte-numa. If the VMA contains
				2296	* areas that are unused or already full of prot_numa
				2297	* PTEs, scan up to virtpages, to skip through those
				2298	* areas faster.
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2299	*/
				2300	if (nr_pte_updates)
				2301	pages -= (end - start) >> PAGE_SHIFT;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2302	virtpages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2303
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2304	start = end;
Rik van Riel	4620f8c	2015-09-11 09:00:27 -0400	[diff] [blame]	2305	if (pages <= 0 \|\| virtpages <= 0)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2306	goto out;
Rik van Riel	3cf1962	2014-02-18 17:12:44 -0500	[diff] [blame]	2307
				2308	cond_resched();
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2309	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2310	}
				2311
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2312	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2313	/*
Peter Zijlstra	c69307d	2013-10-07 11:28:41 +0100	[diff] [blame]	2314	* It is possible to reach the end of the VMA list but the last few
				2315	* VMAs are not guaranteed to the vma_migratable. If they are not, we
				2316	* would find the !migratable VMA on the next scan but not reset the
				2317	* scanner to the start so check it now.
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2318	*/
				2319	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	2320	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	2321	else
				2322	reset_ptenuma_scan(p);
				2323	up_read(&mm->mmap_sem);
Rik van Riel	5117084	2015-11-05 15:56:23 -0500	[diff] [blame]	2324
				2325	/*
				2326	* Make sure tasks use at least 32x as much time to run other code
				2327	* than they used here, to limit NUMA PTE scanning overhead to 3% max.
				2328	* Usually update_task_scan_period slows down scanning enough; on an
				2329	* overloaded system we need to limit overhead on a per task basis.
				2330	*/
				2331	if (unlikely(p->se.sum_exec_runtime != runtime)) {
				2332	u64 diff = p->se.sum_exec_runtime - runtime;
				2333	p->node_stamp += 32 * diff;
				2334	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2335	}
				2336
				2337	/*
				2338	* Drive the periodic memory faults..
				2339	*/
				2340	void task_tick_numa(struct rq rq, struct task_struct curr)
				2341	{
				2342	struct callback_head *work = &curr->numa_work;
				2343	u64 period, now;
				2344
				2345	/*
				2346	* We don't care about NUMA placement if we don't have memory.
				2347	*/
				2348	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				2349	return;
				2350
				2351	/*
				2352	* Using runtime rather than walltime has the dual advantage that
				2353	* we (mostly) drive the selection from busy threads and that the
				2354	* task needs to have done some actual work before we bother with
				2355	* NUMA placement.
				2356	*/
				2357	now = curr->se.sum_exec_runtime;
				2358	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				2359
Rik van Riel	25b3e5a	2015-11-05 15:56:22 -0500	[diff] [blame]	2360	if (now > curr->node_stamp + period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	2361	if (!curr->node_stamp)
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	2362	curr->numa_scan_period = task_scan_min(curr);
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	2363	curr->node_stamp += period;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2364
				2365	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				2366	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				2367	task_work_add(curr, work, true);
				2368	}
				2369	}
				2370	}
				2371	#else
				2372	static void task_tick_numa(struct rq rq, struct task_struct curr)
				2373	{
				2374	}
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2375
				2376	static inline void account_numa_enqueue(struct rq rq, struct task_struct p)
				2377	{
				2378	}
				2379
				2380	static inline void account_numa_dequeue(struct rq rq, struct task_struct p)
				2381	{
				2382	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	2383	#endif /* CONFIG_NUMA_BALANCING */
				2384
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2385	static void
				2386	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				2387	{
				2388	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	2389	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2390	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	2391	#ifdef CONFIG_SMP
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2392	if (entity_is_task(se)) {
				2393	struct rq *rq = rq_of(cfs_rq);
				2394
				2395	account_numa_enqueue(rq, task_of(se));
				2396	list_add(&se->group_node, &rq->cfs_tasks);
				2397	}
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	2398	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2399	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2400	}
				2401
				2402	static void
				2403	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				2404	{
				2405	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	2406	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2407	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2408	if (entity_is_task(se)) {
				2409	account_numa_dequeue(rq_of(cfs_rq), task_of(se));
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	2410	list_del_init(&se->group_node);
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	2411	}
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2412	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2413	}
				2414
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2415	#ifdef CONFIG_FAIR_GROUP_SCHED
				2416	# ifdef CONFIG_SMP
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2417	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)
				2418	{
				2419	long tg_weight;
				2420
				2421	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2422	* Use this CPU's real-time load instead of the last load contribution
				2423	* as the updating of the contribution is delayed, and we will use the
				2424	* the real-time load to calc the share. See update_tg_load_avg().
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2425	*/
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	2426	tg_weight = atomic_long_read(&tg->load_avg);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2427	tg_weight -= cfs_rq->tg_load_avg_contrib;
Yuyang Du	fde7d22e	2015-10-13 09:18:22 +0800	[diff] [blame]	2428	tg_weight += cfs_rq->load.weight;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2429
				2430	return tg_weight;
				2431	}
				2432
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2433	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2434	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2435	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2436
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2437	tg_weight = calc_tg_weight(tg, cfs_rq);
Yuyang Du	fde7d22e	2015-10-13 09:18:22 +0800	[diff] [blame]	2438	load = cfs_rq->load.weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2439
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2440	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2441	if (tg_weight)
				2442	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2443
				2444	if (shares < MIN_SHARES)
				2445	shares = MIN_SHARES;
				2446	if (shares > tg->shares)
				2447	shares = tg->shares;
				2448
				2449	return shares;
				2450	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2451	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2452	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2453	{
				2454	return tg->shares;
				2455	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2456	# endif /* CONFIG_SMP */
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2457	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				2458	unsigned long weight)
				2459	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	2460	if (se->on_rq) {
				2461	/* commit outstanding execution time */
				2462	if (cfs_rq->curr == se)
				2463	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2464	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	2465	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2466
				2467	update_load_set(&se->load, weight);
				2468
				2469	if (se->on_rq)
				2470	account_entity_enqueue(cfs_rq, se);
				2471	}
				2472
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2473	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				2474
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2475	static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2476	{
				2477	struct task_group *tg;
				2478	struct sched_entity *se;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2479	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2480
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2481	tg = cfs_rq->tg;
				2482	se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2483	if (!se \|\| throttled_hierarchy(cfs_rq))
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2484	return;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	2485	#ifndef CONFIG_SMP
				2486	if (likely(se->load.weight == tg->shares))
				2487	return;
				2488	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	2489	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2490
				2491	reweight_entity(cfs_rq_of(se), se, shares);
				2492	}
				2493	#else /* CONFIG_FAIR_GROUP_SCHED */
/* Without CONFIG_FAIR_GROUP_SCHED there are no group shares to update. */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq) { }
				2497	#endif /* CONFIG_FAIR_GROUP_SCHED */
				2498
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	2499	#ifdef CONFIG_SMP
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2500	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				2501	static const u32 runnable_avg_yN_inv[] = {
				2502	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				2503	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				2504	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				2505	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				2506	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				2507	0x85aac367, 0x82cd8698,
				2508	};
				2509
				2510	/*
				2511	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				2512	* over-estimates when re-combining.
				2513	*/
				2514	static const u32 runnable_avg_yN_sum[] = {
				2515	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				2516	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				2517	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				2518	};
				2519
				2520	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2521	* Approximate:
				2522	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				2523	*/
				2524	static __always_inline u64 decay_load(u64 val, u64 n)
				2525	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2526	unsigned int local_n;
				2527
				2528	if (!n)
				2529	return val;
				2530	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				2531	return 0;
				2532
				2533	/* after bounds checking we can collapse to 32-bit */
				2534	local_n = n;
				2535
				2536	/*
				2537	* As y^PERIOD = 1/2, we can combine
Zhihui Zhang	9c58c79	2014-09-20 21:24:36 -0400	[diff] [blame]	2538	* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
				2539	* With a look-up table which covers y^n (n<PERIOD)
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2540	*
				2541	* To achieve constant time decay_load.
				2542	*/
				2543	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				2544	val >>= local_n / LOAD_AVG_PERIOD;
				2545	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2546	}
				2547
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2548	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
				2549	return val;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2550	}
				2551
				2552	/*
				2553	* For updates fully spanning n periods, the contribution to runnable
				2554	* average will be: \Sum 1024*y^n
				2555	*
				2556	* We can compute this reasonably efficiently by combining:
				2557	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				2558	*/
				2559	static u32 __compute_runnable_contrib(u64 n)
				2560	{
				2561	u32 contrib = 0;
				2562
				2563	if (likely(n <= LOAD_AVG_PERIOD))
				2564	return runnable_avg_yN_sum[n];
				2565	else if (unlikely(n >= LOAD_AVG_MAX_N))
				2566	return LOAD_AVG_MAX;
				2567
				2568	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
				2569	do {
				2570	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
				2571	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
				2572
				2573	n -= LOAD_AVG_PERIOD;
				2574	} while (n > LOAD_AVG_PERIOD);
				2575
				2576	contrib = decay_load(contrib, n);
				2577	return contrib + runnable_avg_yN_sum[n];
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2578	}
				2579
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2580	#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 \|\| SCHED_CAPACITY_SHIFT != 10
				2581	#error "load tracking assumes 2^10 as unit"
				2582	#endif
				2583
/* Scale @v by the capacity factor @s, which is in SCHED_CAPACITY_SHIFT units. */
#define cap_scale(v, s)		(((v) * (s)) >> SCHED_CAPACITY_SHIFT)
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2585
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2586	/*
				2587	* We can represent the historical contribution to runnable average as the
				2588	* coefficients of a geometric series. To do this we sub-divide our runnable
				2589	* history into segments of approximately 1ms (1024us); label the segment that
				2590	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				2591	*
				2592	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
				2593	* p0 p1 p2
				2594	* (now) (~1ms ago) (~2ms ago)
				2595	*
				2596	* Let u_i denote the fraction of p_i that the entity was runnable.
				2597	*
				2598	* We then designate the fractions u_i as our co-efficients, yielding the
				2599	* following representation of historical load:
				2600	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
				2601	*
				2602	* We choose y based on the with of a reasonably scheduling period, fixing:
				2603	* y^32 = 0.5
				2604	*
				2605	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				2606	* approximately half as much as the contribution to load within the last ms
				2607	* (u_0).
				2608	*
				2609	* When a period "rolls over" and we have new u_0`, multiplying the previous
				2610	* sum again by y is sufficient to update:
				2611	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
				2612	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
				2613	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2614	static __always_inline int
				2615	__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2616	unsigned long weight, int running, struct cfs_rq *cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2617	{
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2618	u64 delta, scaled_delta, periods;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2619	u32 contrib;
Peter Zijlstra	6115c79	2015-09-07 15:09:15 +0200	[diff] [blame]	2620	unsigned int delta_w, scaled_delta_w, decayed = 0;
Dietmar Eggemann	6f2b045	2015-09-07 14:57:22 +0100	[diff] [blame]	2621	unsigned long scale_freq, scale_cpu;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2622
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2623	delta = now - sa->last_update_time;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2624	/*
				2625	* This should only happen when time goes backwards, which it
				2626	* unfortunately does during sched clock init when we swap over to TSC.
				2627	*/
				2628	if ((s64)delta < 0) {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2629	sa->last_update_time = now;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2630	return 0;
				2631	}
				2632
				2633	/*
				2634	* Use 1024ns as the unit of measurement since it's a reasonable
				2635	* approximation of 1us and fast to compute.
				2636	*/
				2637	delta >>= 10;
				2638	if (!delta)
				2639	return 0;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2640	sa->last_update_time = now;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2641
Dietmar Eggemann	6f2b045	2015-09-07 14:57:22 +0100	[diff] [blame]	2642	scale_freq = arch_scale_freq_capacity(NULL, cpu);
				2643	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
				2644
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2645	/* delta_w is the amount already accumulated against our next period */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2646	delta_w = sa->period_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2647	if (delta + delta_w >= 1024) {
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2648	decayed = 1;
				2649
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2650	/* how much left for next period will start over, we don't know yet */
				2651	sa->period_contrib = 0;
				2652
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2653	/*
				2654	* Now that we know we're crossing a period boundary, figure
				2655	* out how much from delta we need to complete the current
				2656	* period and accrue it.
				2657	*/
				2658	delta_w = 1024 - delta_w;
Peter Zijlstra	54a2138	2015-09-07 15:05:42 +0200	[diff] [blame]	2659	scaled_delta_w = cap_scale(delta_w, scale_freq);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2660	if (weight) {
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2661	sa->load_sum += weight * scaled_delta_w;
				2662	if (cfs_rq) {
				2663	cfs_rq->runnable_load_sum +=
				2664	weight * scaled_delta_w;
				2665	}
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2666	}
Vincent Guittot	36ee28e	2015-02-27 16:54:04 +0100	[diff] [blame]	2667	if (running)
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2668	sa->util_sum += scaled_delta_w * scale_cpu;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2669
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2670	delta -= delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2671
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2672	/* Figure out how many additional periods this update spans */
				2673	periods = delta / 1024;
				2674	delta %= 1024;
				2675
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2676	sa->load_sum = decay_load(sa->load_sum, periods + 1);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2677	if (cfs_rq) {
				2678	cfs_rq->runnable_load_sum =
				2679	decay_load(cfs_rq->runnable_load_sum, periods + 1);
				2680	}
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2681	sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	2682
				2683	/* Efficiently calculate \sum (1..n_period) 1024y^i /
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2684	contrib = __compute_runnable_contrib(periods);
Peter Zijlstra	54a2138	2015-09-07 15:05:42 +0200	[diff] [blame]	2685	contrib = cap_scale(contrib, scale_freq);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2686	if (weight) {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2687	sa->load_sum += weight * contrib;
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2688	if (cfs_rq)
				2689	cfs_rq->runnable_load_sum += weight * contrib;
				2690	}
Vincent Guittot	36ee28e	2015-02-27 16:54:04 +0100	[diff] [blame]	2691	if (running)
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2692	sa->util_sum += contrib * scale_cpu;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2693	}
				2694
				2695	/* Remainder of delta accrued against u_0` */
Peter Zijlstra	54a2138	2015-09-07 15:05:42 +0200	[diff] [blame]	2696	scaled_delta = cap_scale(delta, scale_freq);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2697	if (weight) {
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2698	sa->load_sum += weight * scaled_delta;
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2699	if (cfs_rq)
Dietmar Eggemann	e0f5f3a	2015-08-14 17:23:09 +0100	[diff] [blame]	2700	cfs_rq->runnable_load_sum += weight * scaled_delta;
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2701	}
Vincent Guittot	36ee28e	2015-02-27 16:54:04 +0100	[diff] [blame]	2702	if (running)
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2703	sa->util_sum += scaled_delta * scale_cpu;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2704
				2705	sa->period_contrib += delta;
				2706
				2707	if (decayed) {
				2708	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2709	if (cfs_rq) {
				2710	cfs_rq->runnable_load_avg =
				2711	div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
				2712	}
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2713	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2714	}
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2715
				2716	return decayed;
				2717	}
				2718
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	2719	#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2720	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2721	* Updating tg's load_avg is necessary before update_cfs_share (which is done)
				2722	* and effective_load (which is not done because it is too costly).
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2723	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2724	static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2725	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2726	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2727
Waiman Long	aa0b7ae	2015-12-02 13:41:50 -0500	[diff] [blame]	2728	/*
				2729	* No need to update load_avg for root_task_group as it is not used.
				2730	*/
				2731	if (cfs_rq->tg == &root_task_group)
				2732	return;
				2733
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2734	if (force \|\| abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
				2735	atomic_long_add(delta, &cfs_rq->tg->load_avg);
				2736	cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	2737	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	2738	}
Dietmar Eggemann	f5f9739	2014-02-26 11:19:33 +0000	[diff] [blame]	2739
Byungchul Park	ad936d8	2015-10-24 01:16:19 +0900	[diff] [blame]	2740	/*
				2741	* Called within set_task_rq() right before setting a task's cpu. The
				2742	* caller only guarantees p->pi_lock is held; no other assumptions,
				2743	* including the state of rq->lock, should be made.
				2744	*/
				2745	void set_task_rq_fair(struct sched_entity *se,
				2746	struct cfs_rq prev, struct cfs_rq next)
				2747	{
				2748	if (!sched_feat(ATTACH_AGE_LOAD))
				2749	return;
				2750
				2751	/*
				2752	* We are supposed to update the task to "current" time, then its up to
				2753	* date and ready to go to new CPU/cfs_rq. But we have difficulty in
				2754	* getting what current time is, so simply throw away the out-of-date
				2755	* time. This will result in the wakee task is less decayed, but giving
				2756	* the wakee more load sounds not bad.
				2757	*/
				2758	if (se->avg.last_update_time && prev) {
				2759	u64 p_last_update_time;
				2760	u64 n_last_update_time;
				2761
				2762	#ifndef CONFIG_64BIT
				2763	u64 p_last_update_time_copy;
				2764	u64 n_last_update_time_copy;
				2765
				2766	do {
				2767	p_last_update_time_copy = prev->load_last_update_time_copy;
				2768	n_last_update_time_copy = next->load_last_update_time_copy;
				2769
				2770	smp_rmb();
				2771
				2772	p_last_update_time = prev->avg.last_update_time;
				2773	n_last_update_time = next->avg.last_update_time;
				2774
				2775	} while (p_last_update_time != p_last_update_time_copy \|\|
				2776	n_last_update_time != n_last_update_time_copy);
				2777	#else
				2778	p_last_update_time = prev->avg.last_update_time;
				2779	n_last_update_time = next->avg.last_update_time;
				2780	#endif
				2781	__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
				2782	&se->avg, 0, 0, NULL);
				2783	se->avg.last_update_time = n_last_update_time;
				2784	}
				2785	}
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	2786	#else /* CONFIG_FAIR_GROUP_SCHED */
/* Without CONFIG_FAIR_GROUP_SCHED there is no task group load to update. */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
{
}
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	2788	#endif /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	2789
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2790	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
				2791
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2792	/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
				2793	static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
				2794	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2795	struct sched_avg *sa = &cfs_rq->avg;
Yuyang Du	3e386d5	2015-10-13 09:18:23 +0800	[diff] [blame]	2796	int decayed, removed = 0;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2797
				2798	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
Andrey Ryabinin	9e0e83a	2015-12-14 15:47:23 +0300	[diff] [blame]	2799	s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2800	sa->load_avg = max_t(long, sa->load_avg - r, 0);
				2801	sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
Yuyang Du	3e386d5	2015-10-13 09:18:23 +0800	[diff] [blame]	2802	removed = 1;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2803	}
				2804
				2805	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
				2806	long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
				2807	sa->util_avg = max_t(long, sa->util_avg - r, 0);
Peter Zijlstra	006cdf0	2015-09-09 09:06:17 +0200	[diff] [blame]	2808	sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2809	}
				2810
				2811	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2812	scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2813
				2814	#ifndef CONFIG_64BIT
				2815	smp_wmb();
				2816	cfs_rq->load_last_update_time_copy = sa->last_update_time;
				2817	#endif
				2818
Yuyang Du	3e386d5	2015-10-13 09:18:23 +0800	[diff] [blame]	2819	return decayed \|\| removed;
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2820	}
				2821
				2822	/* Update task and its cfs_rq load average */
				2823	static inline void update_load_avg(struct sched_entity *se, int update_tg)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2824	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2825	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2826	u64 now = cfs_rq_clock_task(cfs_rq);
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2827	int cpu = cpu_of(rq_of(cfs_rq));
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2828
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2829	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2830	* Track task load average for carrying it to new CPU after migrated, and
				2831	* track group sched_entity load average for task_h_load calc in migration
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2832	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2833	__update_load_avg(now, cpu, &se->avg,
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2834	se->on_rq * scale_load_down(se->load.weight),
				2835	cfs_rq->curr == se, NULL);
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2836
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2837	if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
				2838	update_tg_load_avg(cfs_rq, 0);
				2839	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2840
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2841	static void attach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2842	{
Peter Zijlstra	a928051	2015-09-11 16:10:59 +0200	[diff] [blame]	2843	if (!sched_feat(ATTACH_AGE_LOAD))
				2844	goto skip_aging;
				2845
Byungchul Park	6efdb10	2015-08-20 20:21:59 +0900	[diff] [blame]	2846	/*
				2847	* If we got migrated (either between CPUs or between cgroups) we'll
				2848	* have aged the average right before clearing @last_update_time.
				2849	*/
				2850	if (se->avg.last_update_time) {
				2851	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
				2852	&se->avg, 0, 0, NULL);
				2853
				2854	/*
				2855	* XXX: we could have just aged the entire load away if we've been
				2856	* absent from the fair class for too long.
				2857	*/
				2858	}
				2859
Peter Zijlstra	a928051	2015-09-11 16:10:59 +0200	[diff] [blame]	2860	skip_aging:
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2861	se->avg.last_update_time = cfs_rq->avg.last_update_time;
				2862	cfs_rq->avg.load_avg += se->avg.load_avg;
				2863	cfs_rq->avg.load_sum += se->avg.load_sum;
				2864	cfs_rq->avg.util_avg += se->avg.util_avg;
				2865	cfs_rq->avg.util_sum += se->avg.util_sum;
				2866	}
				2867
				2868	static void detach_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2869	{
				2870	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
				2871	&se->avg, se->on_rq * scale_load_down(se->load.weight),
				2872	cfs_rq->curr == se, NULL);
				2873
				2874	cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
				2875	cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
				2876	cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
				2877	cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
				2878	}
				2879
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2880	/* Add the load generated by se into cfs_rq's load average */
				2881	static inline void
				2882	enqueue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2883	{
				2884	struct sched_avg *sa = &se->avg;
				2885	u64 now = cfs_rq_clock_task(cfs_rq);
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2886	int migrated, decayed;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2887
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2888	migrated = !sa->last_update_time;
				2889	if (!migrated) {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2890	__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2891	se->on_rq * scale_load_down(se->load.weight),
				2892	cfs_rq->curr == se, NULL);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2893	}
				2894
				2895	decayed = update_cfs_rq_load_avg(now, cfs_rq);
				2896
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2897	cfs_rq->runnable_load_avg += sa->load_avg;
				2898	cfs_rq->runnable_load_sum += sa->load_sum;
				2899
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2900	if (migrated)
				2901	attach_entity_load_avg(cfs_rq, se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2902
				2903	if (decayed \|\| migrated)
				2904	update_tg_load_avg(cfs_rq, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2905	}
				2906
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2907	/* Remove the runnable load generated by se from cfs_rq's runnable load average */
				2908	static inline void
				2909	dequeue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se)
				2910	{
				2911	update_load_avg(se, 1);
				2912
				2913	cfs_rq->runnable_load_avg =
				2914	max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
				2915	cfs_rq->runnable_load_sum =
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2916	max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2917	}
				2918
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	2919	#ifndef CONFIG_64BIT
				2920	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
				2921	{
				2922	u64 last_update_time_copy;
				2923	u64 last_update_time;
				2924
				2925	do {
				2926	last_update_time_copy = cfs_rq->load_last_update_time_copy;
				2927	smp_rmb();
				2928	last_update_time = cfs_rq->avg.last_update_time;
				2929	} while (last_update_time != last_update_time_copy);
				2930
				2931	return last_update_time;
				2932	}
				2933	#else
				2934	static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
				2935	{
				2936	return cfs_rq->avg.last_update_time;
				2937	}
				2938	#endif
				2939
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2940	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2941	* Task first catches up with cfs_rq, and then subtract
				2942	* itself from the cfs_rq (task must be off the queue now).
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2943	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2944	void remove_entity_load_avg(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2945	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2946	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2947	u64 last_update_time;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2948
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	2949	/*
				2950	* Newly created task or never used group entity should not be removed
				2951	* from its (source) cfs_rq
				2952	*/
				2953	if (se->avg.last_update_time == 0)
				2954	return;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2955
Yuyang Du	0905f04	2015-12-17 07:34:27 +0800	[diff] [blame]	2956	last_update_time = cfs_rq_last_update_time(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2957
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	2958	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	2959	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
				2960	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	2961	}
Vincent Guittot	642dbc3	2013-04-18 18:34:26 +0200	[diff] [blame]	2962
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	2963	static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
				2964	{
				2965	return cfs_rq->runnable_load_avg;
				2966	}
				2967
				2968	static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
				2969	{
				2970	return cfs_rq->avg.load_avg;
				2971	}
				2972
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	2973	static int idle_balance(struct rq *this_rq);
				2974
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	2975	#else /* CONFIG_SMP */
				2976
/* Without CONFIG_SMP this is an empty stub. */
static inline void update_load_avg(struct sched_entity *se, int update_tg)
{
}
				2978	static inline void
				2979	enqueue_entity_load_avg(struct cfs_rq cfs_rq, struct sched_entity se) {}
/* Without CONFIG_SMP this is an empty stub. */
static inline void
dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
/* Without CONFIG_SMP this is an empty stub. */
static inline void remove_entity_load_avg(struct sched_entity *se)
{
}
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	2983
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	2984	static inline void
				2985	attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
				2986	static inline void
				2987	detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
				2988
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	2989	static inline int idle_balance(struct rq *rq)
				2990	{
				2991	return 0;
				2992	}
				2993
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	2994	#endif /* CONFIG_SMP */
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2995
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	2996	static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2997	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2998	#ifdef CONFIG_SCHEDSTATS
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	2999	struct task_struct *tsk = NULL;
				3000
				3001	if (entity_is_task(se))
				3002	tsk = task_of(se);
				3003
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3004	if (se->statistics.sleep_start) {
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3005	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3006
				3007	if ((s64)delta < 0)
				3008	delta = 0;
				3009
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3010	if (unlikely(delta > se->statistics.sleep_max))
				3011	se->statistics.sleep_max = delta;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3012
Peter Zijlstra	8c79a04	2012-01-30 14:51:37 +0100	[diff] [blame]	3013	se->statistics.sleep_start = 0;
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3014	se->statistics.sum_sleep_runtime += delta;
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	3015
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	3016	if (tsk) {
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	3017	account_scheduler_latency(tsk, delta >> 10, 1);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	3018	trace_sched_stat_sleep(tsk, delta);
				3019	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3020	}
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3021	if (se->statistics.block_start) {
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3022	u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3023
				3024	if ((s64)delta < 0)
				3025	delta = 0;
				3026
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3027	if (unlikely(delta > se->statistics.block_max))
				3028	se->statistics.block_max = delta;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3029
Peter Zijlstra	8c79a04	2012-01-30 14:51:37 +0100	[diff] [blame]	3030	se->statistics.block_start = 0;
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3031	se->statistics.sum_sleep_runtime += delta;
Ingo Molnar	30084fb	2007-10-02 14:13:08 +0200	[diff] [blame]	3032
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	3033	if (tsk) {
Arjan van de Ven	8f0dfc3	2009-07-20 11:26:58 -0700	[diff] [blame]	3034	if (tsk->in_iowait) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3035	se->statistics.iowait_sum += delta;
				3036	se->statistics.iowait_count++;
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	3037	trace_sched_stat_iowait(tsk, delta);
Arjan van de Ven	8f0dfc3	2009-07-20 11:26:58 -0700	[diff] [blame]	3038	}
				3039
Andrew Vagin	b781a60	2011-11-28 12:03:35 +0300	[diff] [blame]	3040	trace_sched_stat_blocked(tsk, delta);
				3041
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	3042	/*
				3043	* Blocking time is in units of nanosecs, so shift by
				3044	* 20 to get a milliseconds-range estimation of the
				3045	* amount of time that the task spent sleeping:
				3046	*/
				3047	if (unlikely(prof_on == SLEEP_PROFILING)) {
				3048	profile_hits(SLEEP_PROFILING,
				3049	(void *)get_wchan(tsk),
				3050	delta >> 20);
				3051	}
				3052	account_scheduler_latency(tsk, delta >> 10, 0);
Ingo Molnar	30084fb	2007-10-02 14:13:08 +0200	[diff] [blame]	3053	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3054	}
				3055	#endif
				3056	}
				3057
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	3058	static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
				3059	{
				3060	#ifdef CONFIG_SCHED_DEBUG
				3061	s64 d = se->vruntime - cfs_rq->min_vruntime;
				3062
				3063	if (d < 0)
				3064	d = -d;
				3065
				3066	if (d > 3*sysctl_sched_latency)
				3067	schedstat_inc(cfs_rq, nr_spread_over);
				3068	#endif
				3069	}
				3070
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3071	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3072	place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
				3073	{
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	3074	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	3075
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	3076	/*
				3077	* The 'current' period is already promised to the current tasks,
				3078	* however the extra weight of the new task will slow them down a
				3079	* little, place the new task so that it fits in the slot that
				3080	* stays open at the end.
				3081	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	3082	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	3083	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3084
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3085	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	3086	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3087	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	3088
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3089	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3090	* Halve their sleep time's effect, to allow
				3091	* for a gentler effect of sleepers:
				3092	*/
				3093	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				3094	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	3095
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	3096	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3097	}
				3098
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	3099	/* ensure we never gain time by being placed backwards. */
Viresh Kumar	16c8f1c	2012-11-08 13:33:46 +0530	[diff] [blame]	3100	se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3101	}
				3102
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3103	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				3104
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3105	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3106	enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3107	{
				3108	/*
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3109	* Update the normalized vruntime before updating min_vruntime
Kamalesh Babulal	0fc576d	2013-06-27 11:24:18 +0530	[diff] [blame]	3110	* through calling update_curr().
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3111	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3112	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3113	se->vruntime += cfs_rq->min_vruntime;
				3114
				3115	/*
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	3116	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3117	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	3118	update_curr(cfs_rq);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3119	enqueue_entity_load_avg(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3120	account_entity_enqueue(cfs_rq, se);
				3121	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3122
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3123	if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	3124	place_entity(cfs_rq, se, 0);
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	3125	enqueue_sleeper(cfs_rq, se);
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	3126	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3127
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	3128	update_stats_enqueue(cfs_rq, se);
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	3129	check_spread(cfs_rq, se);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3130	if (se != cfs_rq->curr)
				3131	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3132	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	3133
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3134	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	3135	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3136	check_enqueue_throttle(cfs_rq);
				3137	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3138	}
				3139
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3140	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3141	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3142	for_each_sched_entity(se) {
				3143	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3144	if (cfs_rq->last != se)
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3145	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3146
				3147	cfs_rq->last = NULL;
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3148	}
				3149	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3150
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3151	static void __clear_buddies_next(struct sched_entity *se)
				3152	{
				3153	for_each_sched_entity(se) {
				3154	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3155	if (cfs_rq->next != se)
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3156	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3157
				3158	cfs_rq->next = NULL;
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3159	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3160	}
				3161
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3162	static void __clear_buddies_skip(struct sched_entity *se)
				3163	{
				3164	for_each_sched_entity(se) {
				3165	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3166	if (cfs_rq->skip != se)
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3167	break;
Peter Zijlstra	f104479	2012-02-11 06:05:00 +0100	[diff] [blame]	3168
				3169	cfs_rq->skip = NULL;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3170	}
				3171	}
				3172
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	3173	static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
				3174	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	3175	if (cfs_rq->last == se)
				3176	__clear_buddies_last(se);
				3177
				3178	if (cfs_rq->next == se)
				3179	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3180
				3181	if (cfs_rq->skip == se)
				3182	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	3183	}
				3184
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	3185	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3186
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3187	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3188	dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3189	{
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	3190	/*
				3191	* Update run-time statistics of the 'current'.
				3192	*/
				3193	update_curr(cfs_rq);
Yuyang Du	1396223	2015-07-15 08:04:41 +0800	[diff] [blame]	3194	dequeue_entity_load_avg(cfs_rq, se);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	3195
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	3196	update_stats_dequeue(cfs_rq, se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3197	if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	3198	#ifdef CONFIG_SCHEDSTATS
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3199	if (entity_is_task(se)) {
				3200	struct task_struct *tsk = task_of(se);
				3201
				3202	if (tsk->state & TASK_INTERRUPTIBLE)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3203	se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3204	if (tsk->state & TASK_UNINTERRUPTIBLE)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3205	se->statistics.block_start = rq_clock(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3206	}
Dmitry Adamushko	db36cc7	2007-10-15 17:00:06 +0200	[diff] [blame]	3207	#endif
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	3208	}
				3209
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	3210	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	3211
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3212	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3213	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3214	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3215	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3216
				3217	/*
				3218	* Normalize the entity after updating the min_vruntime because the
				3219	* update can refer to the ->curr item and we need to reflect this
				3220	* movement in our normalized position.
				3221	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3222	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3223	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	3224
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3225	/* return excess runtime on last dequeue */
				3226	return_cfs_rq_runtime(cfs_rq);
				3227
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	3228	update_min_vruntime(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3229	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3230	}
				3231
				3232	/*
				3233	* Preempt the current task with a newly woken task if needed:
				3234	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	3235	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	3236	check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3237	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3238	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3239	struct sched_entity *se;
				3240	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3241
Peter Zijlstra	6d0f0eb	2007-10-15 17:00:05 +0200	[diff] [blame]	3242	ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	3243	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	3244	if (delta_exec > ideal_runtime) {
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3245	resched_curr(rq_of(cfs_rq));
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	3246	/*
				3247	* The current task ran long enough, ensure it doesn't get
				3248	* re-elected due to buddy favours.
				3249	*/
				3250	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3251	return;
				3252	}
				3253
				3254	/*
				3255	* Ensure that a task that missed wakeup preemption by a
				3256	* narrow margin doesn't have to wait for a full slice.
				3257	* This also mitigates buddy induced latencies under load.
				3258	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3259	if (delta_exec < sysctl_sched_min_granularity)
				3260	return;
				3261
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3262	se = __pick_first_entity(cfs_rq);
				3263	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3264
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3265	if (delta < 0)
				3266	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	3267
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	3268	if (delta > ideal_runtime)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3269	resched_curr(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3270	}
				3271
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3272	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	3273	set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3274	{
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3275	/* 'current' is not kept within the tree. */
				3276	if (se->on_rq) {
				3277	/*
				3278	* Any task has to be enqueued before it gets to execute on
				3279	* a CPU. So account for the time it spent waiting on the
				3280	* runqueue.
				3281	*/
				3282	update_stats_wait_end(cfs_rq, se);
				3283	__dequeue_entity(cfs_rq, se);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3284	update_load_avg(se, 1);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	3285	}
				3286
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	3287	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	3288	cfs_rq->curr = se;
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	3289	#ifdef CONFIG_SCHEDSTATS
				3290	/*
				3291	* Track our maximum slice length, if the CPU's load is at
				3292	* least twice that of our own weight (i.e. don't track it
				3293	* when there are only lesser-weight tasks around):
				3294	*/
Dmitry Adamushko	495eca4	2007-10-15 17:00:06 +0200	[diff] [blame]	3295	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3296	se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	3297	se->sum_exec_runtime - se->prev_sum_exec_runtime);
				3298	}
				3299	#endif
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	3300	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3301	}
				3302
Peter Zijlstra	3f3a490	2008-10-24 11:06:16 +0200	[diff] [blame]	3303	static int
				3304	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
				3305
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3306	/*
				3307	* Pick the next process, keeping these things in mind, in this order:
				3308	* 1) keep things fair between processes/task groups
				3309	* 2) pick the "next" process, since someone really wants that to run
				3310	* 3) pick the "last" process, for cache locality
				3311	* 4) do not run the "skip" process, if something else is available
				3312	*/
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3313	static struct sched_entity *
				3314	pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3315	{
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3316	struct sched_entity *left = __pick_first_entity(cfs_rq);
				3317	struct sched_entity *se;
				3318
				3319	/*
				3320	* If curr is set we have to see if it's left of the leftmost entity
				3321	* still in the tree, provided there was anything in the tree at all.
				3322	*/
				3323	if (!left || (curr && entity_before(curr, left)))
				3324	left = curr;
				3325
				3326	se = left; /* ideally we run the leftmost entity */
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	3327
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3328	/*
				3329	* Avoid running the skip buddy, if running something else can
				3330	* be done without getting too unfair.
				3331	*/
				3332	if (cfs_rq->skip == se) {
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3333	struct sched_entity *second;
				3334
				3335	if (se == curr) {
				3336	second = __pick_first_entity(cfs_rq);
				3337	} else {
				3338	second = __pick_next_entity(se);
				3339	if (!second || (curr && entity_before(curr, second)))
				3340	second = curr;
				3341	}
				3342
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3343	if (second && wakeup_preempt_entity(second, left) < 1)
				3344	se = second;
				3345	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3346
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3347	/*
				3348	* Prefer last buddy, try to return the CPU to a preempted task.
				3349	*/
				3350	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				3351	se = cfs_rq->last;
				3352
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3353	/*
				3354	* Someone really wants this to run. If it's not unfair, run it.
				3355	*/
				3356	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				3357	se = cfs_rq->next;
				3358
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3359	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	3360
				3361	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	3362	}
				3363
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	3364	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3365
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	3366	static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3367	{
				3368	/*
				3369	* If still on the runqueue then deactivate_task()
				3370	* was not called and update_curr() has to be done:
				3371	*/
				3372	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	3373	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3374
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3375	/* throttle cfs_rqs exceeding runtime */
				3376	check_cfs_rq_runtime(cfs_rq);
				3377
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	3378	check_spread(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3379	if (prev->on_rq) {
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	3380	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3381	/* Put 'current' back into the tree. */
				3382	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3383	/* in !on_rq case, update occurred at dequeue */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3384	update_load_avg(prev, 0);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3385	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	3386	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3387	}
				3388
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3389	static void
				3390	entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3391	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3392	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3393	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3394	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	3395	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3396
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	3397	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3398	* Ensure that runnable average is periodically updated.
				3399	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	3400	update_load_avg(curr, 1);
Peter Zijlstra	bf0bd94	2013-07-26 23:48:42 +0200	[diff] [blame]	3401	update_cfs_shares(cfs_rq);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	3402
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3403	#ifdef CONFIG_SCHED_HRTICK
				3404	/*
				3405	* queued ticks are scheduled to match the slice, so don't bother
				3406	* validating it and just reschedule.
				3407	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	3408	if (queued) {
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3409	resched_curr(rq_of(cfs_rq));
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	3410	return;
				3411	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3412	/*
				3413	* don't let the period tick interfere with the hrtick preemption
				3414	*/
				3415	if (!sched_feat(DOUBLE_TICK) &&
				3416	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				3417	return;
				3418	#endif
				3419
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	3420	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	3421	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3422	}
				3423
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	3424
				3425	/**************************************************
				3426	* CFS bandwidth control machinery
				3427	*/
				3428
				3429	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3430
				3431	#ifdef HAVE_JUMP_LABEL
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	3432	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3433
				3434	static inline bool cfs_bandwidth_used(void)
				3435	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	3436	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3437	}
				3438
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3439	void cfs_bandwidth_usage_inc(void)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3440	{
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3441	static_key_slow_inc(&__cfs_bandwidth_used);
				3442	}
				3443
				3444	void cfs_bandwidth_usage_dec(void)
				3445	{
				3446	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3447	}
				3448	#else /* HAVE_JUMP_LABEL */
				3449	static bool cfs_bandwidth_used(void)
				3450	{
				3451	return true;
				3452	}
				3453
Ben Segall	1ee14e6	2013-10-16 11:16:12 -0700	[diff] [blame]	3454	void cfs_bandwidth_usage_inc(void) {}
				3455	void cfs_bandwidth_usage_dec(void) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3456	#endif /* HAVE_JUMP_LABEL */
				3457
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	3458	/*
				3459	* default period for cfs group bandwidth.
				3460	* default: 0.1s, units: nanoseconds
				3461	*/
				3462	static inline u64 default_cfs_period(void)
				3463	{
				3464	return 100000000ULL;
				3465	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3466
				3467	static inline u64 sched_cfs_bandwidth_slice(void)
				3468	{
				3469	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				3470	}
				3471
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3472	/*
				3473	* Replenish runtime according to assigned quota and update expiration time.
				3474	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				3475	* additional synchronization around rq->lock.
				3476	*
				3477	* requires cfs_b->lock
				3478	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3479	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3480	{
				3481	u64 now;
				3482
				3483	if (cfs_b->quota == RUNTIME_INF)
				3484	return;
				3485
				3486	now = sched_clock_cpu(smp_processor_id());
				3487	cfs_b->runtime = cfs_b->quota;
				3488	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				3489	}
				3490
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3491	static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
				3492	{
				3493	return &tg->cfs_bandwidth;
				3494	}
				3495
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3496	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				3497	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				3498	{
				3499	if (unlikely(cfs_rq->throttle_count))
				3500	return cfs_rq->throttled_clock_task;
				3501
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3502	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3503	}
				3504
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3505	/* returns 0 on failure to allocate runtime */
				3506	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3507	{
				3508	struct task_group *tg = cfs_rq->tg;
				3509	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3510	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3511
				3512	/* note: this is a positive sum as runtime_remaining <= 0 */
				3513	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				3514
				3515	raw_spin_lock(&cfs_b->lock);
				3516	if (cfs_b->quota == RUNTIME_INF)
				3517	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3518	else {
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	3519	start_cfs_bandwidth(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3520
				3521	if (cfs_b->runtime > 0) {
				3522	amount = min(cfs_b->runtime, min_amount);
				3523	cfs_b->runtime -= amount;
				3524	cfs_b->idle = 0;
				3525	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3526	}
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3527	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3528	raw_spin_unlock(&cfs_b->lock);
				3529
				3530	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3531	/*
				3532	* we may have advanced our local expiration to account for allowed
				3533	* spread between our sched_clock and the one on which runtime was
				3534	* issued.
				3535	*/
				3536	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				3537	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3538
				3539	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3540	}
				3541
				3542	/*
				3543	* Note: This depends on the synchronization provided by sched_clock and the
				3544	* fact that rq->clock snapshots this value.
				3545	*/
				3546	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3547	{
				3548	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3549
				3550	/* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3551	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3552	return;
				3553
				3554	if (cfs_rq->runtime_remaining < 0)
				3555	return;
				3556
				3557	/*
				3558	* If the local deadline has passed we have to consider the
				3559	* possibility that our sched_clock is 'fast' and the global deadline
				3560	* has not truly expired.
				3561	*
				3562	* Fortunately we can determine whether this is the case by checking
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	3563	* whether the global deadline has advanced. It is valid to compare
				3564	* cfs_b->runtime_expires without any locks since we only care about
				3565	* exact equality, so a partial write will still work.
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3566	*/
				3567
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	3568	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3569	/* extend local deadline, drift is bounded above by 2 ticks */
				3570	cfs_rq->runtime_expires += TICK_NSEC;
				3571	} else {
				3572	/* global deadline is ahead, expiration has passed */
				3573	cfs_rq->runtime_remaining = 0;
				3574	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3575	}
				3576
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	3577	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3578	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3579	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3580	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3581	expire_cfs_rq_runtime(cfs_rq);
				3582
				3583	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3584	return;
				3585
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3586	/*
				3587	* if we're unable to extend our runtime we resched so that the active
				3588	* hierarchy can be throttled
				3589	*/
				3590	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3591	resched_curr(rq_of(cfs_rq));
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3592	}
				3593
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	3594	static __always_inline
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	3595	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3596	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	3597	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	3598	return;
				3599
				3600	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				3601	}
				3602
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3603	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				3604	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	3605	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3606	}
				3607
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	3608	/* check whether cfs_rq, or any parent, is throttled */
				3609	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				3610	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	3611	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	3612	}
				3613
				3614	/*
				3615	* Ensure that neither of the group entities corresponding to src_cpu or
				3616	* dest_cpu are members of a throttled hierarchy when performing group
				3617	* load-balance operations.
				3618	*/
				3619	static inline int throttled_lb_pair(struct task_group *tg,
				3620	int src_cpu, int dest_cpu)
				3621	{
				3622	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
				3623
				3624	src_cfs_rq = tg->cfs_rq[src_cpu];
				3625	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				3626
				3627	return throttled_hierarchy(src_cfs_rq) ||
				3628	throttled_hierarchy(dest_cfs_rq);
				3629	}
				3630
				3631	/* updated child weight may affect parent so we have to do this bottom up */
				3632	static int tg_unthrottle_up(struct task_group *tg, void *data)
				3633	{
				3634	struct rq *rq = data;
				3635	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				3636
				3637	cfs_rq->throttle_count--;
				3638	#ifdef CONFIG_SMP
				3639	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3640	/* adjust cfs_rq_clock_task() */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3641	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3642	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	3643	}
				3644	#endif
				3645
				3646	return 0;
				3647	}
				3648
				3649	static int tg_throttle_down(struct task_group *tg, void *data)
				3650	{
				3651	struct rq *rq = data;
				3652	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				3653
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	3654	/* group is entering throttled state, stop time */
				3655	if (!cfs_rq->throttle_count)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3656	cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	3657	cfs_rq->throttle_count++;
				3658
				3659	return 0;
				3660	}
				3661
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3662	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3663	{
				3664	struct rq *rq = rq_of(cfs_rq);
				3665	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3666	struct sched_entity *se;
				3667	long task_delta, dequeue = 1;
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	3668	bool empty;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3669
				3670	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				3671
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	3672	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	3673	rcu_read_lock();
				3674	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				3675	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3676
				3677	task_delta = cfs_rq->h_nr_running;
				3678	for_each_sched_entity(se) {
				3679	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				3680	/* throttled entity or throttle-on-deactivate */
				3681	if (!se->on_rq)
				3682	break;
				3683
				3684	if (dequeue)
				3685	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				3686	qcfs_rq->h_nr_running -= task_delta;
				3687
				3688	if (qcfs_rq->load.weight)
				3689	dequeue = 0;
				3690	}
				3691
				3692	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	3693	sub_nr_running(rq, task_delta);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3694
				3695	cfs_rq->throttled = 1;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3696	cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3697	raw_spin_lock(&cfs_b->lock);
Cong Wang	d49db34	2015-06-24 12:41:47 -0700	[diff] [blame]	3698	empty = list_empty(&cfs_b->throttled_cfs_rq);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	3699
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3700	/*
				3701	* Add to the _head_ of the list, so that an already-started
				3702	* distribute_cfs_runtime will not see us
				3703	*/
				3704	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	3705
				3706	/*
				3707	* If we're the first throttled task, make sure the bandwidth
				3708	* timer is running.
				3709	*/
				3710	if (empty)
				3711	start_cfs_bandwidth(cfs_b);
				3712
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3713	raw_spin_unlock(&cfs_b->lock);
				3714	}
				3715
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3716	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3717	{
				3718	struct rq *rq = rq_of(cfs_rq);
				3719	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3720	struct sched_entity *se;
				3721	int enqueue = 1;
				3722	long task_delta;
				3723
Michael Wang	22b958d	2013-06-04 14:23:39 +0800	[diff] [blame]	3724	se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3725
				3726	cfs_rq->throttled = 0;
Frederic Weisbecker	1a55af2	2013-04-12 01:51:01 +0200	[diff] [blame]	3727
				3728	update_rq_clock(rq);
				3729
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3730	raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	3731	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3732	list_del_rcu(&cfs_rq->throttled_list);
				3733	raw_spin_unlock(&cfs_b->lock);
				3734
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	3735	/* update hierarchical throttle state */
				3736	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				3737
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3738	if (!cfs_rq->load.weight)
				3739	return;
				3740
				3741	task_delta = cfs_rq->h_nr_running;
				3742	for_each_sched_entity(se) {
				3743	if (se->on_rq)
				3744	enqueue = 0;
				3745
				3746	cfs_rq = cfs_rq_of(se);
				3747	if (enqueue)
				3748	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				3749	cfs_rq->h_nr_running += task_delta;
				3750
				3751	if (cfs_rq_throttled(cfs_rq))
				3752	break;
				3753	}
				3754
				3755	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	3756	add_nr_running(rq, task_delta);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3757
				3758	/* determine whether we need to wake up potentially idle cpu */
				3759	if (rq->curr == rq->idle && rq->cfs.nr_running)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	3760	resched_curr(rq);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3761	}
				3762
				3763	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				3764	u64 remaining, u64 expires)
				3765	{
				3766	struct cfs_rq *cfs_rq;
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3767	u64 runtime;
				3768	u64 starting_runtime = remaining;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3769
				3770	rcu_read_lock();
				3771	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				3772	throttled_list) {
				3773	struct rq *rq = rq_of(cfs_rq);
				3774
				3775	raw_spin_lock(&rq->lock);
				3776	if (!cfs_rq_throttled(cfs_rq))
				3777	goto next;
				3778
				3779	runtime = -cfs_rq->runtime_remaining + 1;
				3780	if (runtime > remaining)
				3781	runtime = remaining;
				3782	remaining -= runtime;
				3783
				3784	cfs_rq->runtime_remaining += runtime;
				3785	cfs_rq->runtime_expires = expires;
				3786
				3787	/* we check whether we're throttled above */
				3788	if (cfs_rq->runtime_remaining > 0)
				3789	unthrottle_cfs_rq(cfs_rq);
				3790
				3791	next:
				3792	raw_spin_unlock(&rq->lock);
				3793
				3794	if (!remaining)
				3795	break;
				3796	}
				3797	rcu_read_unlock();
				3798
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3799	return starting_runtime - remaining;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3800	}
				3801
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3802	/*
				3803	* Responsible for refilling a task_group's bandwidth and unthrottling its
				3804	* cfs_rqs as appropriate. If there has been no activity within the last
				3805	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				3806	* used to track this state.
				3807	*/
				3808	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				3809	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3810	u64 runtime, runtime_expires;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	3811	int throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3812
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3813	/* no need to continue the timer with no bandwidth constraint */
				3814	if (cfs_b->quota == RUNTIME_INF)
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	3815	goto out_deactivate;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3816
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3817	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	3818	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3819
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	3820	/*
				3821	* idle depends on !throttled (for the case of a large deficit), and if
				3822	* we're going inactive then everything else can be deferred
				3823	*/
				3824	if (cfs_b->idle && !throttled)
				3825	goto out_deactivate;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	3826
				3827	__refill_cfs_bandwidth_runtime(cfs_b);
				3828
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3829	if (!throttled) {
				3830	/* mark as potentially idle for the upcoming period */
				3831	cfs_b->idle = 1;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	3832	return 0;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3833	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3834
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	3835	/* account preceding periods in which throttling occurred */
				3836	cfs_b->nr_throttled += overrun;
				3837
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3838	runtime_expires = cfs_b->runtime_expires;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3839
				3840	/*
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3841	* This check is repeated as we are holding onto the new bandwidth while
				3842	* we unthrottle. This can potentially race with an unthrottled group
				3843	* trying to acquire new bandwidth from the global pool. This can result
				3844	* in us over-using our runtime if it is all used during this loop, but
				3845	* only by limited amounts in that extreme case.
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3846	*/
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3847	while (throttled && cfs_b->runtime > 0) {
				3848	runtime = cfs_b->runtime;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3849	raw_spin_unlock(&cfs_b->lock);
				3850	/* we can't nest cfs_b->lock while distributing bandwidth */
				3851	runtime = distribute_cfs_runtime(cfs_b, runtime,
				3852	runtime_expires);
				3853	raw_spin_lock(&cfs_b->lock);
				3854
				3855	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3856
				3857	cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3858	}
				3859
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	3860	/*
				3861	* While we are ensured activity in the period following an
				3862	* unthrottle, this also covers the case in which the new bandwidth is
				3863	* insufficient to cover the existing bandwidth deficit. (Forcing the
				3864	* timer to remain active while there are any throttled entities.)
				3865	*/
				3866	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3867
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	3868	return 0;
				3869
				3870	out_deactivate:
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	3871	return 1;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	3872	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3873
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3874	/* a cfs_rq won't donate quota below this amount */
				3875	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				3876	/* minimum remaining period time to redistribute slack quota */
				3877	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				3878	/* how long we wait to gather additional slack before distributing */
				3879	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				3880
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	3881	/*
				3882	* Are we near the end of the current quota period?
				3883	*
				3884	* Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
Thomas Gleixner	4961b6e	2015-04-14 21:09:05 +0000	[diff] [blame]	3885	* hrtimer base being cleared by hrtimer_start. In the case of
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	3886	* migrate_hrtimers, base is never cleared, so we are fine.
				3887	*/
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3888	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				3889	{
				3890	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				3891	u64 remaining;
				3892
				3893	/* if the call-back is running a quota refresh is already occurring */
				3894	if (hrtimer_callback_running(refresh_timer))
				3895	return 1;
				3896
				3897	/* is a quota refresh about to occur? */
				3898	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				3899	if (remaining < min_expire)
				3900	return 1;
				3901
				3902	return 0;
				3903	}
				3904
				3905	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				3906	{
				3907	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				3908
				3909	/* if there's a quota refresh soon don't bother with slack */
				3910	if (runtime_refresh_within(cfs_b, min_left))
				3911	return;
				3912
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	3913	hrtimer_start(&cfs_b->slack_timer,
				3914	ns_to_ktime(cfs_bandwidth_slack_period),
				3915	HRTIMER_MODE_REL);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3916	}
				3917
				3918	/* we know any runtime found here is valid as update_curr() precedes return */
				3919	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3920	{
				3921	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				3922	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				3923
				3924	if (slack_runtime <= 0)
				3925	return;
				3926
				3927	raw_spin_lock(&cfs_b->lock);
				3928	if (cfs_b->quota != RUNTIME_INF &&
				3929	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				3930	cfs_b->runtime += slack_runtime;
				3931
				3932	/* we are under rq->lock, defer unthrottling using a timer */
				3933	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				3934	!list_empty(&cfs_b->throttled_cfs_rq))
				3935	start_cfs_slack_bandwidth(cfs_b);
				3936	}
				3937	raw_spin_unlock(&cfs_b->lock);
				3938
				3939	/* even if it's not valid for return we don't want to try again */
				3940	cfs_rq->runtime_remaining -= slack_runtime;
				3941	}
				3942
				3943	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				3944	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	3945	if (!cfs_bandwidth_used())
				3946	return;
				3947
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	3948	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3949	return;
				3950
				3951	__return_cfs_rq_runtime(cfs_rq);
				3952	}
				3953
				3954	/*
				3955	* This is done with a timer (instead of inline with bandwidth return) since
				3956	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				3957	*/
				3958	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				3959	{
				3960	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				3961	u64 expires;
				3962
				3963	/* confirm we're still not at a refresh boundary */
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3964	raw_spin_lock(&cfs_b->lock);
Ben Segall	db06e78	2013-10-16 11:16:17 -0700	[diff] [blame]	3965	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
				3966	raw_spin_unlock(&cfs_b->lock);
				3967	return;
				3968	}
				3969
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3970	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3971	runtime = cfs_b->runtime;
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3972
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3973	expires = cfs_b->runtime_expires;
				3974	raw_spin_unlock(&cfs_b->lock);
				3975
				3976	if (!runtime)
				3977	return;
				3978
				3979	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				3980
				3981	raw_spin_lock(&cfs_b->lock);
				3982	if (expires == cfs_b->runtime_expires)
Ben Segall	c06f04c	2014-06-20 15:21:20 -0700	[diff] [blame]	3983	cfs_b->runtime -= min(runtime, cfs_b->runtime);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	3984	raw_spin_unlock(&cfs_b->lock);
				3985	}
				3986
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3987	/*
				3988	* When a group wakes up we want to make sure that its quota is not already
				3989	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				3990	* runtime as update_curr() throttling can not trigger until it's on-rq.
				3991	*/
				3992	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				3993	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	3994	if (!cfs_bandwidth_used())
				3995	return;
				3996
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	3997	/* an active group must be handled by the update_curr()->put() path */
				3998	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
				3999	return;
				4000
				4001	/* ensure the group is not already throttled */
				4002	if (cfs_rq_throttled(cfs_rq))
				4003	return;
				4004
				4005	/* update runtime allocation */
				4006	account_cfs_rq_runtime(cfs_rq, 0);
				4007	if (cfs_rq->runtime_remaining <= 0)
				4008	throttle_cfs_rq(cfs_rq);
				4009	}
				4010
				4011	/* conditionally throttle active cfs_rq's from put_prev_entity() */
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4012	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4013	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4014	if (!cfs_bandwidth_used())
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4015	return false;
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	4016
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4017	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4018	return false;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4019
				4020	/*
				4021	* it's possible for a throttled entity to be forced into a running
				4022	* state (e.g. set_curr_task), in this case we're finished.
				4023	*/
				4024	if (cfs_rq_throttled(cfs_rq))
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4025	return true;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4026
				4027	throttle_cfs_rq(cfs_rq);
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4028	return true;
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4029	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4030
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4031	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				4032	{
				4033	struct cfs_bandwidth *cfs_b =
				4034	container_of(timer, struct cfs_bandwidth, slack_timer);
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4035
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4036	do_sched_cfs_slack_timer(cfs_b);
				4037
				4038	return HRTIMER_NORESTART;
				4039	}
				4040
				4041	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				4042	{
				4043	struct cfs_bandwidth *cfs_b =
				4044	container_of(timer, struct cfs_bandwidth, period_timer);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4045	int overrun;
				4046	int idle = 0;
				4047
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4048	raw_spin_lock(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4049	for (;;) {
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4050	overrun = hrtimer_forward_now(timer, cfs_b->period);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4051	if (!overrun)
				4052	break;
				4053
				4054	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				4055	}
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4056	if (idle)
				4057	cfs_b->period_active = 0;
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4058	raw_spin_unlock(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4059
				4060	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				4061	}
				4062
				4063	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4064	{
				4065	raw_spin_lock_init(&cfs_b->lock);
				4066	cfs_b->runtime = 0;
				4067	cfs_b->quota = RUNTIME_INF;
				4068	cfs_b->period = ns_to_ktime(default_cfs_period());
				4069
				4070	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4071	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4072	cfs_b->period_timer.function = sched_cfs_period_timer;
				4073	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				4074	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				4075	}
				4076
				4077	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				4078	{
				4079	cfs_rq->runtime_enabled = 0;
				4080	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				4081	}
				4082
Peter Zijlstra	77a4d1a	2015-04-15 11:41:57 +0200	[diff] [blame]	4083	void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4084	{
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4085	lockdep_assert_held(&cfs_b->lock);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4086
Peter Zijlstra	4cfafd3	2015-05-14 12:23:11 +0200	[diff] [blame]	4087	if (!cfs_b->period_active) {
				4088	cfs_b->period_active = 1;
				4089	hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
				4090	hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
				4091	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4092	}
				4093
				4094	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				4095	{
Tetsuo Handa	7f1a169	2014-12-25 15:51:21 +0900	[diff] [blame]	4096	/* init_cfs_bandwidth() was not called */
				4097	if (!cfs_b->throttled_cfs_rq.next)
				4098	return;
				4099
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4100	hrtimer_cancel(&cfs_b->period_timer);
				4101	hrtimer_cancel(&cfs_b->slack_timer);
				4102	}
				4103
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4104	static void __maybe_unused update_runtime_enabled(struct rq *rq)
				4105	{
				4106	struct cfs_rq *cfs_rq;
				4107
				4108	for_each_leaf_cfs_rq(rq, cfs_rq) {
				4109	struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
				4110
				4111	raw_spin_lock(&cfs_b->lock);
				4112	cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
				4113	raw_spin_unlock(&cfs_b->lock);
				4114	}
				4115	}
				4116
Arnd Bergmann	38dc334	2013-01-25 14:14:22 +0000	[diff] [blame]	4117	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4118	{
				4119	struct cfs_rq *cfs_rq;
				4120
				4121	for_each_leaf_cfs_rq(rq, cfs_rq) {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4122	if (!cfs_rq->runtime_enabled)
				4123	continue;
				4124
				4125	/*
				4126	* clock_task is not advancing so we just need to make sure
				4127	* there's some valid quota amount
				4128	*/
Ben Segall	51f2176	2014-05-19 15:49:45 -0700	[diff] [blame]	4129	cfs_rq->runtime_remaining = 1;
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4130	/*
				4131	* Offline rq is schedulable till cpu is completely disabled
				4132	* in take_cpu_down(), so we prevent new cfs throttling here.
				4133	*/
				4134	cfs_rq->runtime_enabled = 0;
				4135
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4136	if (cfs_rq_throttled(cfs_rq))
				4137	unthrottle_cfs_rq(cfs_rq);
				4138	}
				4139	}
				4140
				4141	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4142	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				4143	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4144	return rq_clock_task(rq_of(cfs_rq));
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	4145	}
				4146
Peter Zijlstra	9dbdb15	2013-11-18 18:27:06 +0100	[diff] [blame]	4147	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	4148	static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	4149	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	4150	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4151
				4152	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				4153	{
				4154	return 0;
				4155	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4156
				4157	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				4158	{
				4159	return 0;
				4160	}
				4161
				4162	static inline int throttled_lb_pair(struct task_group *tg,
				4163	int src_cpu, int dest_cpu)
				4164	{
				4165	return 0;
				4166	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4167
				4168	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				4169
				4170	#ifdef CONFIG_FAIR_GROUP_SCHED
				4171	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	4172	#endif
				4173
/*
 * !CONFIG_CFS_BANDWIDTH stand-in: no bandwidth structure exists, so the
 * lookup always yields NULL regardless of @tg.
 *
 * The pointer declarators on the return type and on @tg are required: the
 * function returns NULL, which is only meaningful for a pointer return type.
 */
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
				4178	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Kirill Tkhai	0e59bda	2014-06-25 12:19:42 +0400	[diff] [blame]	4179	static inline void update_runtime_enabled(struct rq *rq) {}
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	4180	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4181
				4182	#endif /* CONFIG_CFS_BANDWIDTH */
				4183
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4184	/**************************************************
				4185	* CFS operations on tasks:
				4186	*/
				4187
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4188	#ifdef CONFIG_SCHED_HRTICK
				4189	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				4190	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4191	struct sched_entity *se = &p->se;
				4192	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				4193
				4194	WARN_ON(task_rq(p) != rq);
				4195
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	4196	if (cfs_rq->nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4197	u64 slice = sched_slice(cfs_rq, se);
				4198	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				4199	s64 delta = slice - ran;
				4200
				4201	if (delta < 0) {
				4202	if (rq->curr == p)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	4203	resched_curr(rq);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4204	return;
				4205	}
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	4206	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4207	}
				4208	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4209
				4210	/*
				4211	* called from enqueue/dequeue and updates the hrtick when the
				4212	* current task is from our class and nr_running is low enough
				4213	* to matter.
				4214	*/
				4215	static void hrtick_update(struct rq *rq)
				4216	{
				4217	struct task_struct *curr = rq->curr;
				4218
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	4219	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4220	return;
				4221
				4222	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				4223	hrtick_start_fair(rq, curr);
				4224	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	4225	#else /* !CONFIG_SCHED_HRTICK */
/*
 * !CONFIG_SCHED_HRTICK: both hrtick hooks compile to nothing. The stub
 * takes the same pointer parameters that the call sites pass.
 */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4234	#endif
				4235
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4236	/*
				4237	* The enqueue_task method is called before nr_running is
				4238	* increased. Here we update the fair scheduling stats and
				4239	* then put the task into the rbtree:
				4240	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	4241	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4242	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4243	{
				4244	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4245	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4246
				4247	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4248	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4249	break;
				4250	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4251	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4252
				4253	/*
				4254	* end evaluation on encountering a throttled cfs_rq
				4255	*
				4256	* note: in the case of encountering a throttled cfs_rq we will
				4257	* post the final h_nr_running increment below.
				4258	*/
				4259	if (cfs_rq_throttled(cfs_rq))
				4260	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4261	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4262
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4263	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4264	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4265
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4266	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	4267	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4268	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4269
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4270	if (cfs_rq_throttled(cfs_rq))
				4271	break;
				4272
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	4273	update_load_avg(se, 1);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	4274	update_cfs_shares(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4275	}
				4276
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4277	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4278	add_nr_running(rq, 1);
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4279
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4280	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4281	}
				4282
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4283	static void set_next_buddy(struct sched_entity *se);
				4284
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4285	/*
				4286	* The dequeue_task method is called before nr_running is
				4287	* decreased. We remove the task from the rbtree and
				4288	* update the fair scheduling stats:
				4289	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4290	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4291	{
				4292	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	4293	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4294	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4295
				4296	for_each_sched_entity(se) {
				4297	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4298	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4299
				4300	/*
				4301	* end evaluation on encountering a throttled cfs_rq
				4302	*
				4303	* note: in the case of encountering a throttled cfs_rq we will
				4304	* post the final h_nr_running decrement below.
				4305	*/
				4306	if (cfs_rq_throttled(cfs_rq))
				4307	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4308	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4309
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4310	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4311	if (cfs_rq->load.weight) {
				4312	/*
				4313	* Bias pick_next to pick a task from this cfs_rq, as
				4314	* p is sleeping when it is within its sched_slice.
				4315	*/
				4316	if (task_sleep && parent_entity(se))
				4317	set_next_buddy(parent_entity(se));
Paul Turner	9598c82	2011-07-06 22:30:37 -0700	[diff] [blame]	4318
				4319	/* avoid re-evaluating load for this entity */
				4320	se = parent_entity(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4321	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	4322	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	4323	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4324	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	4325
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4326	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	4327	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	4328	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4329
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	4330	if (cfs_rq_throttled(cfs_rq))
				4331	break;
				4332
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	4333	update_load_avg(se, 1);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	4334	update_cfs_shares(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4335	}
				4336
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4337	if (!se)
Kirill Tkhai	7246544	2014-05-09 03:00:14 +0400	[diff] [blame]	4338	sub_nr_running(rq, 1);
Yuyang Du	cd126af	2015-07-15 08:04:36 +0800	[diff] [blame]	4339
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	4340	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	4341	}
				4342
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4343	#ifdef CONFIG_SMP
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4344
				4345	/*
				4346	* per rq 'load' array crap; XXX kill this.
				4347	*/
				4348
				4349	/*
Peter Zijlstra	d937cdc	2015-10-19 13:49:30 +0200	[diff] [blame]	4350	* The exact cpuload calculated at every tick would be:
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4351	*
Peter Zijlstra	d937cdc	2015-10-19 13:49:30 +0200	[diff] [blame]	4352	* load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
				4353	*
				4354	* If a cpu misses updates for n ticks (as it was idle) and update gets
				4355	* called on the n+1-th tick when cpu may be busy, then we have:
				4356	*
				4357	* load_n = (1 - 1/2^i)^n * load_0
				4358	* load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4359	*
				4360	* decay_load_missed() below does efficient calculation of
Peter Zijlstra	d937cdc	2015-10-19 13:49:30 +0200	[diff] [blame]	4361	*
				4362	* load' = (1 - 1/2^i)^n * load
				4363	*
				4364	* Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
				4365	* This allows us to precompute the above in said factors, thereby allowing the
				4366	* reduction of an arbitrary n in O(log_2 n) steps. (See also
				4367	* fixed_power_int())
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4368	*
				4369	* The calculation is approximated on a 128 point scale.
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4370	*/
				4371	#define DEGRADE_SHIFT 7
Peter Zijlstra	d937cdc	2015-10-19 13:49:30 +0200	[diff] [blame]	4372
				4373	static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
				4374	static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
				4375	{ 0, 0, 0, 0, 0, 0, 0, 0 },
				4376	{ 64, 32, 8, 0, 0, 0, 0, 0 },
				4377	{ 96, 72, 40, 12, 1, 0, 0, 0 },
				4378	{ 112, 98, 75, 43, 15, 1, 0, 0 },
				4379	{ 120, 112, 98, 76, 45, 16, 2, 0 }
				4380	};
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4381
				4382	/*
				4383	* Update cpu_load for any missed ticks, due to tickless idle. The backlog
				4384	* would be when CPU is idle and so we just decay the old load without
				4385	* adding any new load.
				4386	*/
				4387	static unsigned long
				4388	decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
				4389	{
				4390	int j = 0;
				4391
				4392	if (!missed_updates)
				4393	return load;
				4394
				4395	if (missed_updates >= degrade_zero_ticks[idx])
				4396	return 0;
				4397
				4398	if (idx == 1)
				4399	return load >> missed_updates;
				4400
				4401	while (missed_updates) {
				4402	if (missed_updates % 2)
				4403	load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
				4404
				4405	missed_updates >>= 1;
				4406	j++;
				4407	}
				4408	return load;
				4409	}
				4410
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4411	/**
				4412	* __update_cpu_load - update the rq->cpu_load[] statistics
				4413	* @this_rq: The rq to update statistics for
				4414	* @this_load: The current load
				4415	* @pending_updates: The number of missed updates
				4416	* @active: !0 for NOHZ_FULL
				4417	*
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4418	* Update rq->cpu_load[] statistics. This function is usually called every
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4419	* scheduler tick (TICK_NSEC).
				4420	*
				4421	* This function computes a decaying average:
				4422	*
				4423	* load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
				4424	*
				4425	* Because of NOHZ it might not get called on every tick which gives need for
				4426	* the @pending_updates argument.
				4427	*
				4428	* load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
				4429	* = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
				4430	* = A * (A * load[i]_n-2 + B) + B
				4431	* = A * (A * (A * load[i]_n-3 + B) + B) + B
				4432	* = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
				4433	* = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
				4434	* = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
				4435	* = (1 - 1/2^i)^n * (load[i]_0 - load) + load
				4436	*
				4437	* In the above we've assumed load_n := load, which is true for NOHZ_FULL as
				4438	* any change in load would have resulted in the tick being turned back on.
				4439	*
				4440	* For regular NOHZ, this reduces to:
				4441	*
				4442	* load[i]_n = (1 - 1/2^i)^n * load[i]_0
				4443	*
				4444	* see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
				4445	* term. See the @active paramter.
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4446	*/
				4447	static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4448	unsigned long pending_updates, int active)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4449	{
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4450	unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0;
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4451	int i, scale;
				4452
				4453	this_rq->nr_load_updates++;
				4454
				4455	/* Update our load: */
				4456	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
				4457	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
				4458	unsigned long old_load, new_load;
				4459
				4460	/* scale is effectively 1 << i now, and >> i divides by scale */
				4461
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4462	old_load = this_rq->cpu_load[i] - tickless_load;
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4463	old_load = decay_load_missed(old_load, pending_updates - 1, i);
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4464	old_load += tickless_load;
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4465	new_load = this_load;
				4466	/*
				4467	* Round up the averaging division if load is increasing. This
				4468	* prevents us from getting stuck on 9 if the load is 10, for
				4469	* example.
				4470	*/
				4471	if (new_load > old_load)
				4472	new_load += scale - 1;
				4473
				4474	this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
				4475	}
				4476
				4477	sched_avg_update(this_rq);
				4478	}
				4479
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	4480	/* Used instead of source_load when we know the type == 0 */
				4481	static unsigned long weighted_cpuload(const int cpu)
				4482	{
				4483	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
				4484	}
				4485
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4486	#ifdef CONFIG_NO_HZ_COMMON
				4487	/*
				4488	* There is no sane way to deal with nohz on smp when using jiffies because the
				4489	* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
				4490	* causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
				4491	*
				4492	* Therefore we cannot use the delta approach from the regular tick since that
				4493	* would seriously skew the load calculation. However we'll make do for those
				4494	* updates happening while idle (nohz_idle_balance) or coming out of idle
				4495	* (tick_nohz_idle_exit).
				4496	*
				4497	* This means we might still be one tick off for nohz periods.
				4498	*/
				4499
				4500	/*
				4501	* Called from nohz_idle_balance() to update the load ratings before doing the
				4502	* idle balance.
				4503	*/
				4504	static void update_idle_cpu_load(struct rq *this_rq)
				4505	{
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	4506	unsigned long curr_jiffies = READ_ONCE(jiffies);
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	4507	unsigned long load = weighted_cpuload(cpu_of(this_rq));
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4508	unsigned long pending_updates;
				4509
				4510	/*
				4511	* bail if there's load or we're actually up-to-date.
				4512	*/
				4513	if (load \|\| curr_jiffies == this_rq->last_load_update_tick)
				4514	return;
				4515
				4516	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
				4517	this_rq->last_load_update_tick = curr_jiffies;
				4518
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4519	__update_cpu_load(this_rq, load, pending_updates, 0);
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4520	}
				4521
				4522	/*
				4523	* Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
				4524	*/
Byungchul Park	525705d	2015-11-10 09:36:02 +0900	[diff] [blame]	4525	void update_cpu_load_nohz(int active)
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4526	{
				4527	struct rq *this_rq = this_rq();
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	4528	unsigned long curr_jiffies = READ_ONCE(jiffies);
Byungchul Park	525705d	2015-11-10 09:36:02 +0900	[diff] [blame]	4529	unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4530	unsigned long pending_updates;
				4531
				4532	if (curr_jiffies == this_rq->last_load_update_tick)
				4533	return;
				4534
				4535	raw_spin_lock(&this_rq->lock);
				4536	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
				4537	if (pending_updates) {
				4538	this_rq->last_load_update_tick = curr_jiffies;
				4539	/*
Byungchul Park	525705d	2015-11-10 09:36:02 +0900	[diff] [blame]	4540	* In the regular NOHZ case, we were idle, this means load 0.
				4541	* In the NOHZ_FULL case, we were non-idle, we should consider
				4542	* its weighted load.
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4543	*/
Byungchul Park	525705d	2015-11-10 09:36:02 +0900	[diff] [blame]	4544	__update_cpu_load(this_rq, load, pending_updates, active);
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4545	}
				4546	raw_spin_unlock(&this_rq->lock);
				4547	}
				4548	#endif /* CONFIG_NO_HZ */
				4549
				4550	/*
				4551	* Called from scheduler_tick()
				4552	*/
				4553	void update_cpu_load_active(struct rq *this_rq)
				4554	{
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	4555	unsigned long load = weighted_cpuload(cpu_of(this_rq));
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4556	/*
				4557	* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
				4558	*/
				4559	this_rq->last_load_update_tick = jiffies;
Byungchul Park	5954327	2015-10-14 18:47:35 +0900	[diff] [blame]	4560	__update_cpu_load(this_rq, load, 1, 1);
Peter Zijlstra	3289bdb	2015-04-14 13:19:42 +0200	[diff] [blame]	4561	}
				4562
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4563	/*
				4564	* Return a low guess at the load of a migration-source cpu weighted
				4565	* according to the scheduling class and "nice" value.
				4566	*
				4567	* We want to under-estimate the load of migration sources, to
				4568	* balance conservatively.
				4569	*/
				4570	static unsigned long source_load(int cpu, int type)
				4571	{
				4572	struct rq *rq = cpu_rq(cpu);
				4573	unsigned long total = weighted_cpuload(cpu);
				4574
				4575	if (type == 0 \|\| !sched_feat(LB_BIAS))
				4576	return total;
				4577
				4578	return min(rq->cpu_load[type-1], total);
				4579	}
				4580
				4581	/*
				4582	* Return a high guess at the load of a migration-target cpu weighted
				4583	* according to the scheduling class and "nice" value.
				4584	*/
				4585	static unsigned long target_load(int cpu, int type)
				4586	{
				4587	struct rq *rq = cpu_rq(cpu);
				4588	unsigned long total = weighted_cpuload(cpu);
				4589
				4590	if (type == 0 \|\| !sched_feat(LB_BIAS))
				4591	return total;
				4592
				4593	return max(rq->cpu_load[type-1], total);
				4594	}
				4595
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	4596	static unsigned long capacity_of(int cpu)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4597	{
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	4598	return cpu_rq(cpu)->cpu_capacity;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4599	}
				4600
Vincent Guittot	ca6d75e	2015-02-27 16:54:09 +0100	[diff] [blame]	4601	static unsigned long capacity_orig_of(int cpu)
				4602	{
				4603	return cpu_rq(cpu)->cpu_capacity_orig;
				4604	}
				4605
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4606	static unsigned long cpu_avg_load_per_task(int cpu)
				4607	{
				4608	struct rq *rq = cpu_rq(cpu);
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	4609	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	4610	unsigned long load_avg = weighted_cpuload(cpu);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4611
				4612	if (nr_running)
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	4613	return load_avg / nr_running;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4614
				4615	return 0;
				4616	}
				4617
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	4618	static void record_wakee(struct task_struct *p)
				4619	{
				4620	/*
				4621	* Rough decay (wiping) for cost saving, don't worry
				4622	* about the boundary, really active task won't care
				4623	* about the loss.
				4624	*/
Manuel Schölling	2538d96	2014-05-22 19:45:23 +0200	[diff] [blame]	4625	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
Rik van Riel	096aa33	2014-05-16 00:13:32 -0400	[diff] [blame]	4626	current->wakee_flips >>= 1;
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	4627	current->wakee_flip_decay_ts = jiffies;
				4628	}
				4629
				4630	if (current->last_wakee != p) {
				4631	current->last_wakee = p;
				4632	current->wakee_flips++;
				4633	}
				4634	}
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	4635
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	4636	static void task_waking_fair(struct task_struct *p)
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4637	{
				4638	struct sched_entity *se = &p->se;
				4639	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	4640	u64 min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4641
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	4642	#ifndef CONFIG_64BIT
				4643	u64 min_vruntime_copy;
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	4644
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	4645	do {
				4646	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				4647	smp_rmb();
				4648	min_vruntime = cfs_rq->min_vruntime;
				4649	} while (min_vruntime != min_vruntime_copy);
				4650	#else
				4651	min_vruntime = cfs_rq->min_vruntime;
				4652	#endif
				4653
				4654	se->vruntime -= min_vruntime;
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	4655	record_wakee(p);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	4656	}
				4657
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	4658	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	4659	/*
				4660	* effective_load() calculates the load change as seen from the root_task_group
				4661	*
				4662	* Adding load to a group doesn't make a group heavier, but can cause movement
				4663	* of group shares between cpus. Assuming the shares were perfectly aligned one
				4664	* can calculate the shift in shares.
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	4665	*
				4666	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				4667	* on this @cpu and results in a total addition (subtraction) of @wg to the
				4668	* total group weight.
				4669	*
				4670	* Given a runqueue weight distribution (rw_i) we can compute a shares
				4671	* distribution (s_i) using:
				4672	*
				4673	* s_i = rw_i / \Sum rw_j (1)
				4674	*
				4675	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				4676	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				4677	* shares distribution (s_i):
				4678	*
				4679	* rw_i = { 2, 4, 1, 0 }
				4680	* s_i = { 2/7, 4/7, 1/7, 0 }
				4681	*
				4682	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				4683	* task used to run on and the CPU the waker is running on), we need to
				4684	* compute the effect of waking a task on either CPU and, in case of a sync
				4685	* wakeup, compute the effect of the current task going to sleep.
				4686	*
				4687	* So for a change of @wl to the local @cpu with an overall group weight change
				4688	* of @wg we can compute the new shares distribution (s'_i) using:
				4689	*
				4690	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				4691	*
				4692	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				4693	* differences in waking a task to CPU 0. The additional task changes the
				4694	* weight and shares distributions like:
				4695	*
				4696	* rw'_i = { 3, 4, 1, 0 }
				4697	* s'_i = { 3/8, 4/8, 1/8, 0 }
				4698	*
				4699	* We can then compute the difference in effective weight by using:
				4700	*
				4701	* dw_i = S * (s'_i - s_i) (3)
				4702	*
				4703	* Where 'S' is the group weight as seen by its parent.
				4704	*
				4705	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				4706	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				4707	* 4/7) times the weight of the group.
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	4708	*/
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	4709	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	4710	{
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	4711	struct sched_entity *se = tg->se[cpu];
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	4712
Rik van Riel	9722c2d	2014-01-06 11:39:12 +0000	[diff] [blame]	4713	if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	4714	return wl;
				4715
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	4716	for_each_sched_entity(se) {
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	4717	long w, W;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	4718
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	4719	tg = se->my_q->tg;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	4720
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	4721	/*
				4722	* W = @wg + \Sum rw_j
				4723	*/
				4724	W = wg + calc_tg_weight(tg, se->my_q);
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	4725
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	4726	/*
				4727	* w = rw_i + @wl
				4728	*/
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	4729	w = cfs_rq_load_avg(se->my_q) + wl;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	4730
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	4731	/*
				4732	* wl = S * s'_i; see (2)
				4733	*/
				4734	if (W > 0 && w < W)
Yuyang Du	32a8df4	2014-12-19 08:29:56 +0800	[diff] [blame]	4735	wl = (w * (long)tg->shares) / W;
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	4736	else
				4737	wl = tg->shares;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	4738
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	4739	/*
				4740	* Per the above, wl is the new se->load.weight value; since
				4741	* those are clipped to [MIN_SHARES, ...) do so now. See
				4742	* calc_cfs_shares().
				4743	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	4744	if (wl < MIN_SHARES)
				4745	wl = MIN_SHARES;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	4746
				4747	/*
				4748	* wl = dw_i = S * (s'_i - s_i); see (3)
				4749	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	4750	wl -= se->avg.load_avg;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	4751
				4752	/*
				4753	* Recursively apply this logic to all parent groups to compute
				4754	* the final effective load change on the root group. Since
				4755	* only the @tg group gets extra weight, all parent groups can
				4756	* only redistribute existing shares. @wl is the shift in shares
				4757	* resulting from this level per the above.
				4758	*/
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	4759	wg = 0;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	4760	}
				4761
				4762	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	4763	}
				4764	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	4765
/*
 * !CONFIG_FAIR_GROUP_SCHED: there is no group hierarchy to walk, so the
 * effective load change is @wl itself.
 */
static long
effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
	return wl;
}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	4770
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	4771	#endif
				4772
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	4773	/*
				4774	* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
				4775	* A waker of many should wake a different task than the one last awakened
				4776	* at a frequency roughly N times higher than one of its wakees. In order
				4777	* to determine whether we should let the load spread vs consolidating to
				4778	* shared cache, we look for a minimum 'flip' frequency of llc_size in one
				4779	* partner, and a factor of llc_size higher frequency in the other. With
				4780	* both conditions met, we can be relatively sure that the relationship is
				4781	* non-monogamous, with partner count exceeding socket size. Waker/wakee
				4782	* being client/server, worker/dispatcher, interrupt source or whatever is
				4783	* irrelevant, spread criteria is apparent partner count exceeds socket size.
				4784	*/
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	4785	static int wake_wide(struct task_struct *p)
				4786	{
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	4787	unsigned int master = current->wakee_flips;
				4788	unsigned int slave = p->wakee_flips;
Peter Zijlstra	7d9ffa8	2013-07-04 12:56:46 +0800	[diff] [blame]	4789	int factor = this_cpu_read(sd_llc_size);
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	4790
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	4791	if (master < slave)
				4792	swap(master, slave);
				4793	if (slave < factor || master < slave * factor)
				4794	return 0;
				4795	return 1;
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	4796	}
				4797
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4798	static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	4799	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	4800	s64 this_load, load;
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	4801	s64 this_eff_load, prev_eff_load;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4802	int idx, this_cpu, prev_cpu;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4803	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	4804	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	4805	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	4806
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4807	idx = sd->wake_idx;
				4808	this_cpu = smp_processor_id();
				4809	prev_cpu = task_cpu(p);
				4810	load = source_load(prev_cpu, idx);
				4811	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	4812
				4813	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	4814	* If sync wakeup then subtract the (maximum possible)
				4815	* effect of the currently running task from the load
				4816	* of the current CPU:
				4817	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	4818	if (sync) {
				4819	tg = task_group(current);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	4820	weight = current->se.avg.load_avg;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	4821
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4822	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	4823	load += effective_load(tg, prev_cpu, 0, -weight);
				4824	}
				4825
				4826	tg = task_group(p);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	4827	weight = p->se.avg.load_avg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	4828
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	4829	/*
				4830	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	4831	* due to the sync cause above having dropped this_load to 0, we'll
				4832	* always have an imbalance, but there's really nothing you can do
				4833	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	4834	*
				4835	* Otherwise check if either cpus are near enough in load to allow this
				4836	* task to be woken on this_cpu.
				4837	*/
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	4838	this_eff_load = 100;
				4839	this_eff_load *= capacity_of(prev_cpu);
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4840
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	4841	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				4842	prev_eff_load *= capacity_of(this_cpu);
				4843
				4844	if (this_load > 0) {
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4845	this_eff_load *= this_load +
				4846	effective_load(tg, this_cpu, weight, weight);
				4847
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4848	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	4849	}
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4850
Vincent Guittot	bd61c98	2014-08-26 13:06:50 +0200	[diff] [blame]	4851	balanced = this_eff_load <= prev_eff_load;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	4852
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4853	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	4854
Vincent Guittot	05bfb65	2014-08-26 13:06:45 +0200	[diff] [blame]	4855	if (!balanced)
				4856	return 0;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	4857
Vincent Guittot	05bfb65	2014-08-26 13:06:45 +0200	[diff] [blame]	4858	schedstat_inc(sd, ttwu_move_affine);
				4859	schedstat_inc(p, se.statistics.nr_wakeups_affine);
				4860
				4861	return 1;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	4862	}
				4863
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4864	/*
				4865	* find_idlest_group finds and returns the least busy CPU group within the
				4866	* domain.
				4867	*/
				4868	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	4869	find_idlest_group(struct sched_domain *sd, struct task_struct *p,
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	4870	int this_cpu, int sd_flag)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4871	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	4872	struct sched_group *idlest = NULL, *group = sd->groups;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4873	unsigned long min_load = ULONG_MAX, this_load = 0;
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	4874	int load_idx = sd->forkexec_idx;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4875	int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4876
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	4877	if (sd_flag & SD_BALANCE_WAKE)
				4878	load_idx = sd->wake_idx;
				4879
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4880	do {
				4881	unsigned long load, avg_load;
				4882	int local_group;
				4883	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4884
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4885	/* Skip over this group if it has no CPUs allowed */
				4886	if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	4887	tsk_cpus_allowed(p)))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4888	continue;
				4889
				4890	local_group = cpumask_test_cpu(this_cpu,
				4891	sched_group_cpus(group));
				4892
				4893	/* Tally up the load of all CPUs in the group */
				4894	avg_load = 0;
				4895
				4896	for_each_cpu(i, sched_group_cpus(group)) {
				4897	/* Bias balancing toward cpus of our domain */
				4898	if (local_group)
				4899	load = source_load(i, load_idx);
				4900	else
				4901	load = target_load(i, load_idx);
				4902
				4903	avg_load += load;
				4904	}
				4905
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	4906	/* Adjust by relative CPU capacity of the group */
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	4907	avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4908
				4909	if (local_group) {
				4910	this_load = avg_load;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4911	} else if (avg_load < min_load) {
				4912	min_load = avg_load;
				4913	idlest = group;
				4914	}
				4915	} while (group = group->next, group != sd->groups);
				4916
				4917	if (!idlest || 100*this_load < imbalance*min_load)
				4918	return NULL;
				4919	return idlest;
				4920	}
				4921
				4922	/*
				4923	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				4924	*/
				4925	static int
				4926	find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
				4927	{
				4928	unsigned long load, min_load = ULONG_MAX;
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	4929	unsigned int min_exit_latency = UINT_MAX;
				4930	u64 latest_idle_timestamp = 0;
				4931	int least_loaded_cpu = this_cpu;
				4932	int shallowest_idle_cpu = -1;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4933	int i;
				4934
				4935	/* Traverse only the allowed CPUs */
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	4936	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	4937	if (idle_cpu(i)) {
				4938	struct rq *rq = cpu_rq(i);
				4939	struct cpuidle_state *idle = idle_get_state(rq);
				4940	if (idle && idle->exit_latency < min_exit_latency) {
				4941	/*
				4942	* We give priority to a CPU whose idle state
				4943	* has the smallest exit latency irrespective
				4944	* of any idle timestamp.
				4945	*/
				4946	min_exit_latency = idle->exit_latency;
				4947	latest_idle_timestamp = rq->idle_stamp;
				4948	shallowest_idle_cpu = i;
				4949	} else if ((!idle || idle->exit_latency == min_exit_latency) &&
				4950	rq->idle_stamp > latest_idle_timestamp) {
				4951	/*
				4952	* If equal or no active idle state, then
				4953	* the most recently idled CPU might have
				4954	* a warmer cache.
				4955	*/
				4956	latest_idle_timestamp = rq->idle_stamp;
				4957	shallowest_idle_cpu = i;
				4958	}
Yao Dongdong	9f96742	2014-10-28 04:08:06 +0000	[diff] [blame]	4959	} else if (shallowest_idle_cpu == -1) {
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	4960	load = weighted_cpuload(i);
				4961	if (load < min_load || (load == min_load && i == this_cpu)) {
				4962	min_load = load;
				4963	least_loaded_cpu = i;
				4964	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4965	}
				4966	}
				4967
Nicolas Pitre	83a0a96	2014-09-04 11:32:10 -0400	[diff] [blame]	4968	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4969	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	4970
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	4971	/*
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	4972	* Try and locate an idle CPU in the sched_domain.
				4973	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	4974	static int select_idle_sibling(struct task_struct *p, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	4975	{
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	4976	struct sched_domain *sd;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	4977	struct sched_group *sg;
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	4978	int i = task_cpu(p);
				4979
				4980	if (idle_cpu(target))
				4981	return target;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	4982
				4983	/*
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	4984	* If the previous cpu is cache affine and idle, don't be stupid.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	4985	*/
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	4986	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
				4987	return i;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	4988
				4989	/*
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	4990	* Otherwise, iterate the domains and find an eligible idle cpu.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	4991	*/
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	4992	sd = rcu_dereference(per_cpu(sd_llc, target));
Suresh Siddha	77e8136	2011-11-17 11:08:23 -0800	[diff] [blame]	4993	for_each_lower_domain(sd) {
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	4994	sg = sd->groups;
				4995	do {
				4996	if (!cpumask_intersects(sched_group_cpus(sg),
				4997	tsk_cpus_allowed(p)))
				4998	goto next;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	4999
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	5000	for_each_cpu(i, sched_group_cpus(sg)) {
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	5001	if (i == target || !idle_cpu(i))
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	5002	goto next;
				5003	}
				5004
				5005	target = cpumask_first_and(sched_group_cpus(sg),
				5006	tsk_cpus_allowed(p));
				5007	goto done;
				5008	next:
				5009	sg = sg->next;
				5010	} while (sg != sd->groups);
				5011	}
				5012	done:
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5013	return target;
				5014	}
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5015
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5016	/*
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5017	* cpu_util returns the amount of capacity of a CPU that is used by CFS
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5018	* tasks. The unit of the return value must be the one of capacity so we can
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5019	* compare the utilization with the capacity of the CPU that is available for
				5020	* CFS task (ie cpu_capacity).
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5021	*
				5022	* cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
				5023	* recent utilization of currently non-runnable tasks on a CPU. It represents
				5024	* the amount of utilization of a CPU in the range [0..capacity_orig] where
				5025	* capacity_orig is the cpu_capacity available at the highest frequency
				5026	* (arch_scale_freq_capacity()).
				5027	* The utilization of a CPU converges towards a sum equal to or less than the
				5028	* current capacity (capacity_curr <= capacity_orig) of the CPU because it is
				5029	* the running time on this CPU scaled by capacity_curr.
				5030	*
				5031	* Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
				5032	* higher than capacity_orig because of unfortunate rounding in
				5033	* cfs.avg.util_avg or just after migrating tasks and new task wakeups until
				5034	* the average stabilizes with the new running time. We need to check that the
				5035	* utilization stays within the range of [0..capacity_orig] and cap it if
				5036	* necessary. Without utilization capping, a group could be seen as overloaded
				5037	* (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
				5038	* available capacity. We allow utilization to overshoot capacity_curr (but not
				5039	* capacity_orig) as it is useful for predicting the capacity required after task
				5040	* migrations (scheduler-driven DVFS).
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5041	*/
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5042	static int cpu_util(int cpu)
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5043	{
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	5044	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5045	unsigned long capacity = capacity_orig_of(cpu);
				5046
Dietmar Eggemann	231678b	2015-08-14 17:23:13 +0100	[diff] [blame]	5047	return (util >= capacity) ? capacity : util;
Vincent Guittot	8bb5b00	2015-03-04 08:48:47 +0100	[diff] [blame]	5048	}
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	5049
				5050	/*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5051	* select_task_rq_fair: Select target runqueue for the waking task in domains
				5052	* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
				5053	* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5054	*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5055	* Balances load by selecting the idlest cpu in the idlest group, or under
				5056	* certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5057	*
Morten Rasmussen	de91b9c	2014-02-18 14:14:24 +0000	[diff] [blame]	5058	* Returns the target cpu number.
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5059	*
				5060	* preempt must be disabled.
				5061	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	5062	static int
Peter Zijlstra	ac66f54	2013-10-07 11:29:16 +0100	[diff] [blame]	5063	select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5064	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	5065	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5066	int cpu = smp_processor_id();
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5067	int new_cpu = prev_cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5068	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	5069	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5070
Kirill Tkhai	a8edd07	2014-09-12 17:41:16 +0400	[diff] [blame]	5071	if (sd_flag & SD_BALANCE_WAKE)
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5072	want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5073
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5074	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5075	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f4288	2009-12-16 18:04:34 +0100	[diff] [blame]	5076	if (!(tmp->flags & SD_LOAD_BALANCE))
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5077	break;
Peter Zijlstra	e4f4288	2009-12-16 18:04:34 +0100	[diff] [blame]	5078
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5079	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5080	* If both cpu and prev_cpu are part of this domain,
				5081	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	5082	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	5083	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				5084	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				5085	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	5086	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5087	}
				5088
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	5089	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	5090	sd = tmp;
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5091	else if (!want_affine)
				5092	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5093	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5094
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5095	if (affine_sd) {
				5096	sd = NULL; /* Prefer wake_affine over balance flags */
				5097	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
				5098	new_cpu = cpu;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	5099	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	5100
Mike Galbraith	63b0e9e	2015-07-14 17:39:50 +0200	[diff] [blame]	5101	if (!sd) {
				5102	if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
				5103	new_cpu = select_idle_sibling(p, new_cpu);
				5104
				5105	} else while (sd) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5106	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5107	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5108
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	5109	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5110	sd = sd->child;
				5111	continue;
				5112	}
				5113
Vincent Guittot	c44f2a0	2013-10-18 13:52:21 +0200	[diff] [blame]	5114	group = find_idlest_group(sd, p, cpu, sd_flag);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5115	if (!group) {
				5116	sd = sd->child;
				5117	continue;
				5118	}
				5119
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	5120	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5121	if (new_cpu == -1 || new_cpu == cpu) {
				5122	/* Now try balancing at a lower domain level of cpu */
				5123	sd = sd->child;
				5124	continue;
				5125	}
				5126
				5127	/* Now try balancing at a lower domain level of new_cpu */
				5128	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	5129	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5130	sd = NULL;
				5131	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	5132	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5133	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	5134	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	5135	sd = tmp;
				5136	}
				5137	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5138	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5139	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5140
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	5141	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5142	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	5143
				5144	/*
				5145	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				5146	* cfs_rq_of(p) references at time of call are still valid and identify the
Byungchul Park	525628c	2015-11-18 09:34:59 +0900	[diff] [blame]	5147	* previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	5148	*/
xiaofeng.yan	5a4fd03	2015-09-23 14:55:59 +0800	[diff] [blame]	5149	static void migrate_task_rq_fair(struct task_struct *p)
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	5150	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5151	/*
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5152	* We are supposed to update the task to "current" time, then it's up to date
				5153	* and ready to go to new CPU/cfs_rq. But we have difficulty in getting
				5154	* what current time is, so simply throw away the out-of-date time. This
				5155	* will result in the wakee task being less decayed, but giving the wakee more
				5156	* load sounds not bad.
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5157	*/
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5158	remove_entity_load_avg(&p->se);
				5159
				5160	/* Tell new CPU we are migrated */
				5161	p->se.avg.last_update_time = 0;
Ben Segall	3944a92	2014-05-15 15:59:20 -0700	[diff] [blame]	5162
				5163	/* We have migrated, no longer consider this task hot */
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	5164	p->se.exec_start = 0;
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	5165	}
Yuyang Du	1269557	2015-07-15 08:04:40 +0800	[diff] [blame]	5166
				5167	static void task_dead_fair(struct task_struct *p)
				5168	{
				5169	remove_entity_load_avg(&p->se);
				5170	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	5171	#endif /* CONFIG_SMP */
				5172
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	5173	static unsigned long
				5174	wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	5175	{
				5176	unsigned long gran = sysctl_sched_wakeup_granularity;
				5177
				5178	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	5179	* Since it's curr running now, convert the gran from real-time
				5180	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	5181	*
				5182	* By using 'se' instead of 'curr' we penalize light tasks, so
				5183	* they get preempted easier. That is, if 'se' < 'curr' then
				5184	* the resulting gran will be larger, therefore penalizing the
				5185	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				5186	* be smaller, again penalizing the lighter task.
				5187	*
				5188	* This is especially important for buddies when the leftmost
				5189	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	5190	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	5191	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	5192	}
				5193
				5194	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	5195	* Should 'se' preempt 'curr'.
				5196	*
				5197	*             |s1
				5198	*        |s2
				5199	*   |s3
				5200	*         g
				5201	*      |<--->|c
				5202	*
				5203	* w(c, s1) = -1
				5204	* w(c, s2) = 0
				5205	* w(c, s3) = 1
				5206	*
				5207	*/
				5208	static int
				5209	wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
				5210	{
				5211	s64 gran, vdiff = curr->vruntime - se->vruntime;
				5212
				5213	if (vdiff <= 0)
				5214	return -1;
				5215
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	5216	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	5217	if (vdiff > gran)
				5218	return 1;
				5219
				5220	return 0;
				5221	}
				5222
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	5223	static void set_last_buddy(struct sched_entity *se)
				5224	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	5225	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				5226	return;
				5227
				5228	for_each_sched_entity(se)
				5229	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	5230	}
				5231
				5232	static void set_next_buddy(struct sched_entity *se)
				5233	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	5234	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				5235	return;
				5236
				5237	for_each_sched_entity(se)
				5238	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	5239	}
				5240
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	5241	static void set_skip_buddy(struct sched_entity *se)
				5242	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	5243	for_each_sched_entity(se)
				5244	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	5245	}
				5246
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	5247	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5248	* Preempt the current task with a newly woken task if needed:
				5249	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	5250	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5251	{
				5252	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	5253	struct sched_entity *se = &curr->se, *pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	5254	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	5255	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	5256	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	5257
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	5258	if (unlikely(se == pse))
				5259	return;
				5260
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	5261	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5262	* This is possible from callers such as attach_tasks(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	5263	* unconditionally check_preempt_curr() after an enqueue (which may have
				5264	* led to a throttle). This both saves work and prevents false
				5265	* next-buddy nomination below.
				5266	*/
				5267	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				5268	return;
				5269
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	5270	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	5271	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	5272	next_buddy_marked = 1;
				5273	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	5274
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	5275	/*
				5276	* We can come here with TIF_NEED_RESCHED already set from new task
				5277	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	5278	*
				5279	* Note: this also catches the edge-case of curr being in a throttled
				5280	* group (e.g. via set_curr_task), since update_curr() (in the
				5281	* enqueue of curr) will have resulted in resched being set. This
				5282	* prevents us from potentially nominating it as a false LAST_BUDDY
				5283	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	5284	*/
				5285	if (test_tsk_need_resched(curr))
				5286	return;
				5287
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	5288	/* Idle tasks are by definition preempted by non-idle tasks. */
				5289	if (unlikely(curr->policy == SCHED_IDLE) &&
				5290	likely(p->policy != SCHED_IDLE))
				5291	goto preempt;
				5292
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	5293	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	5294	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				5295	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	5296	*/
Ingo Molnar	8ed92e5	2012-10-14 14:28:50 +0200	[diff] [blame]	5297	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	5298	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5299
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	5300	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	5301	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	5302	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	5303	if (wakeup_preempt_entity(se, pse) == 1) {
				5304	/*
				5305	* Bias pick_next to pick the sched entity that is
				5306	* triggering this preemption.
				5307	*/
				5308	if (!next_buddy_marked)
				5309	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	5310	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	5311	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	5312
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	5313	return;
				5314
				5315	preempt:
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	5316	resched_curr(rq);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	5317	/*
				5318	* Only set the backward buddy when the current task is still
				5319	* on the rq. This can happen when a wakeup gets interleaved
				5320	* with schedule on the ->pre_schedule() or idle_balance()
				5321	* point, either of which can * drop the rq lock.
				5322	*
				5323	* Also, during early boot the idle thread is in the fair class,
				5324	* for obvious reasons its a bad idea to schedule back to it.
				5325	*/
				5326	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				5327	return;
				5328
				5329	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				5330	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5331	}
				5332
Peter Zijlstra	606dba2	2012-02-11 06:05:00 +0100	[diff] [blame]	5333	static struct task_struct *
				5334	pick_next_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5335	{
				5336	struct cfs_rq *cfs_rq = &rq->cfs;
				5337	struct sched_entity *se;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5338	struct task_struct *p;
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	5339	int new_tasks;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5340
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	5341	again:
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5342	#ifdef CONFIG_FAIR_GROUP_SCHED
				5343	if (!cfs_rq->nr_running)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	5344	goto idle;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5345
Peter Zijlstra	3f1d2a3	2014-02-12 10:49:30 +0100	[diff] [blame]	5346	if (prev->sched_class != &fair_sched_class)
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5347	goto simple;
				5348
				5349	/*
				5350	* Because of the set_next_buddy() in dequeue_task_fair() it is rather
				5351	* likely that a next task is from the same cgroup as the current.
				5352	*
				5353	* Therefore attempt to avoid putting and setting the entire cgroup
				5354	* hierarchy, only change the part that actually changes.
				5355	*/
				5356
				5357	do {
				5358	struct sched_entity *curr = cfs_rq->curr;
				5359
				5360	/*
				5361	* Since we got here without doing put_prev_entity() we also
				5362	* have to consider cfs_rq->curr. If it is still a runnable
				5363	* entity, update_curr() will update its vruntime, otherwise
				5364	* forget we've ever seen it.
				5365	*/
Ben Segall	54d2736	2015-04-06 15:28:10 -0700	[diff] [blame]	5366	if (curr) {
				5367	if (curr->on_rq)
				5368	update_curr(cfs_rq);
				5369	else
				5370	curr = NULL;
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5371
Ben Segall	54d2736	2015-04-06 15:28:10 -0700	[diff] [blame]	5372	/*
				5373	* This call to check_cfs_rq_runtime() will do the
				5374	* throttle and dequeue its entity in the parent(s).
				5375	* Therefore the 'simple' nr_running test will indeed
				5376	* be correct.
				5377	*/
				5378	if (unlikely(check_cfs_rq_runtime(cfs_rq)))
				5379	goto simple;
				5380	}
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5381
				5382	se = pick_next_entity(cfs_rq, curr);
				5383	cfs_rq = group_cfs_rq(se);
				5384	} while (cfs_rq);
				5385
				5386	p = task_of(se);
				5387
				5388	/*
				5389	* Since we haven't yet done put_prev_entity and if the selected task
				5390	* is a different task than we started out with, try and touch the
				5391	* least amount of cfs_rqs.
				5392	*/
				5393	if (prev != p) {
				5394	struct sched_entity *pse = &prev->se;
				5395
				5396	while (!(cfs_rq = is_same_group(se, pse))) {
				5397	int se_depth = se->depth;
				5398	int pse_depth = pse->depth;
				5399
				5400	if (se_depth <= pse_depth) {
				5401	put_prev_entity(cfs_rq_of(pse), pse);
				5402	pse = parent_entity(pse);
				5403	}
				5404	if (se_depth >= pse_depth) {
				5405	set_next_entity(cfs_rq_of(se), se);
				5406	se = parent_entity(se);
				5407	}
				5408	}
				5409
				5410	put_prev_entity(cfs_rq, pse);
				5411	set_next_entity(cfs_rq, se);
				5412	}
				5413
				5414	if (hrtick_enabled(rq))
				5415	hrtick_start_fair(rq, p);
				5416
				5417	return p;
				5418	simple:
				5419	cfs_rq = &rq->cfs;
				5420	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5421
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	5422	if (!cfs_rq->nr_running)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	5423	goto idle;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5424
Peter Zijlstra	3f1d2a3	2014-02-12 10:49:30 +0100	[diff] [blame]	5425	put_prev_task(rq, prev);
Peter Zijlstra	606dba2	2012-02-11 06:05:00 +0100	[diff] [blame]	5426
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5427	do {
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5428	se = pick_next_entity(cfs_rq, NULL);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	5429	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5430	cfs_rq = group_cfs_rq(se);
				5431	} while (cfs_rq);
				5432
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	5433	p = task_of(se);
Peter Zijlstra	678d571	2012-02-11 06:05:00 +0100	[diff] [blame]	5434
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	5435	if (hrtick_enabled(rq))
				5436	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	5437
				5438	return p;
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	5439
				5440	idle:
Peter Zijlstra	cbce1a6	2015-06-11 14:46:54 +0200	[diff] [blame]	5441	/*
				5442	* This is OK, because current is on_cpu, which avoids it being picked
				5443	* for load-balance and preemption/IRQs are still disabled avoiding
				5444	* further scheduler activity on it and we're being very careful to
				5445	* re-start the picking loop.
				5446	*/
				5447	lockdep_unpin_lock(&rq->lock);
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	5448	new_tasks = idle_balance(rq);
Peter Zijlstra	cbce1a6	2015-06-11 14:46:54 +0200	[diff] [blame]	5449	lockdep_pin_lock(&rq->lock);
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	5450	/*
				5451	* Because idle_balance() releases (and re-acquires) rq->lock, it is
				5452	* possible for any higher priority task to appear. In that case we
				5453	* must re-start the pick_next_entity() loop.
				5454	*/
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	5455	if (new_tasks < 0)
Peter Zijlstra	37e117c	2014-02-14 12:25:08 +0100	[diff] [blame]	5456	return RETRY_TASK;
				5457
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	5458	if (new_tasks > 0)
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	5459	goto again;
Peter Zijlstra	38033c3	2014-01-23 20:32:21 +0100	[diff] [blame]	5460
				5461	return NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5462	}
				5463
				5464	/*
				5465	* Account for a descheduled task:
				5466	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	5467	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5468	{
				5469	struct sched_entity *se = &prev->se;
				5470	struct cfs_rq *cfs_rq;
				5471
				5472	for_each_sched_entity(se) {
				5473	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	5474	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5475	}
				5476	}
				5477
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	5478	/*
				5479	* sched_yield() is very simple
				5480	*
				5481	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				5482	*/
				5483	static void yield_task_fair(struct rq *rq)
				5484	{
				5485	struct task_struct *curr = rq->curr;
				5486	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				5487	struct sched_entity *se = &curr->se;
				5488
				5489	/*
				5490	* Are we the only task in the tree?
				5491	*/
				5492	if (unlikely(rq->nr_running == 1))
				5493	return;
				5494
				5495	clear_buddies(cfs_rq, se);
				5496
				5497	if (curr->policy != SCHED_BATCH) {
				5498	update_rq_clock(rq);
				5499	/*
				5500	* Update run-time statistics of the 'current'.
				5501	*/
				5502	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	5503	/*
				5504	* Tell update_rq_clock() that we've just updated,
				5505	* so we don't do microscopic update in schedule()
				5506	* and double the fastpath cost.
				5507	*/
Peter Zijlstra	9edfbfe	2015-01-05 11:18:11 +0100	[diff] [blame]	5508	rq_clock_skip_update(rq, true);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	5509	}
				5510
				5511	set_skip_buddy(se);
				5512	}
				5513
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	5514	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				5515	{
				5516	struct sched_entity *se = &p->se;
				5517
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	5518	/* throttled hierarchies are not runnable */
				5519	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	5520	return false;
				5521
				5522	/* Tell the scheduler that we'd really like pse to run next. */
				5523	set_next_buddy(se);
				5524
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	5525	yield_task_fair(rq);
				5526
				5527	return true;
				5528	}
				5529
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	5530	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5531	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	5532	* Fair scheduling class load-balancing methods.
				5533	*
				5534	* BASICS
				5535	*
				5536	* The purpose of load-balancing is to achieve the same basic fairness the
				5537	* per-cpu scheduler provides, namely provide a proportional amount of compute
				5538	* time to each task. This is expressed in the following equation:
				5539	*
				5540	* W_i,n/C_i == W_j,n/C_j for all i,j (1)
				5541	*
				5542	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				5543	* W_i,0 is defined as:
				5544	*
				5545	* W_i,0 = \Sum_j w_i,j (2)
				5546	*
				5547	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
				5548	* is derived from the nice value as per prio_to_weight[].
				5549	*
				5550	* The weight average is an exponential decay average of the instantaneous
				5551	* weight:
				5552	*
				5553	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				5554	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	5555	* C_i is the compute capacity of cpu i, typically it is the
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	5556	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				5557	* can also include other factors [XXX].
				5558	*
				5559	* To achieve this balance we define a measure of imbalance which follows
				5560	* directly from (1):
				5561	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	5562	* imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	5563	*
				5564	* We then move tasks around to minimize the imbalance. In the continuous
				5565	* function space it is obvious this converges, in the discrete case we get
				5566	* a few fun cases generally called infeasible weight scenarios.
				5567	*
				5568	* [XXX expand on:
				5569	* - infeasible weights;
				5570	* - local vs global optima in the discrete case. ]
				5571	*
				5572	*
				5573	* SCHED DOMAINS
				5574	*
				5575	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				5576	* for all i,j solution, we create a tree of cpus that follows the hardware
				5577	* topology where each level pairs two lower groups (or better). This results
				5578	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				5579	* tree to only the first of the previous level and we decrease the frequency
				5580	* of load-balance at each level inv. proportional to the number of cpus in
				5581	* the groups.
				5582	*
				5583	* This yields:
				5584	*
				5585	* \sum_{i=0}^{\log_2 n} \frac{1}{2^i} \cdot \frac{n}{2^i} \cdot 2^i = O(n) (5)
				5586	*
				5587	* where, for level i of the tree:
				5588	* - the sum runs over all levels,
				5589	* - 1/2^i is the load-balance frequency,
				5590	* - n/2^i is the number of cpus doing load-balance,
				5591	* - 2^i is the size of each group.
				5592	*
				5593	* Coupled with a limit on how many tasks we can migrate every balance pass,
				5594	* this makes (5) the runtime complexity of the balancer.
				5595	*
				5596	* An important property here is that each CPU is still (indirectly) connected
				5597	* to every other cpu in at most O(log n) steps:
				5598	*
				5599	* The adjacency matrix of the resulting graph is given by:
				5600	*
				5602	* A_{i,j} = \bigcup_{k=0}^{\log_2 n} [ (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) ] (6)
				5604	*
				5605	* And you'll find that:
				5606	*
				5607	* A^(log_2 n)_i,j != 0 for all i,j (7)
				5608	*
				5609	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				5610	* The task movement gives a factor of O(m), giving a convergence complexity
				5611	* of:
				5612	*
				5613	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				5614	*
				5615	*
				5616	* WORK CONSERVING
				5617	*
				5618	* In order to avoid CPUs going idle while there's still work to do, new idle
				5619	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				5620	* tree itself instead of relying on other CPUs to bring it work.
				5621	*
				5622	* This adds some complexity to both (5) and (8) but it reduces the total idle
				5623	* time.
				5624	*
				5625	* [XXX more?]
				5626	*
				5627	*
				5628	* CGROUPS
				5629	*
				5630	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				5631	*
				5633	* W_{i,0} = \sum_j \prod_k w_k \cdot \frac{s_{k,i}}{S_k} (9)
				5635	*
				5636	* Where
				5637	*
				5638	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				5639	*
				5640	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				5641	*
				5642	* The big problem is S_k, it's a global sum needed to compute a local (W_i)
				5643	* property.
				5644	*
				5645	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				5646	* rewrite all of this once again.]
				5647	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5648
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	5649	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				5650
/*
 * Classification stored in lb_env::fbq_type. NOTE(review): the meaning of each
 * value is set by code outside this part of the file -- confirm there.
 */
enum fbq_type { regular, remote, all };
				5652
/*
 * Bits for lb_env::flags.
 *
 * LBF_ALL_PINNED:  cleared by can_migrate_task() once a task that may run on
 *                  env->dst_cpu has been seen.
 * LBF_NEED_BREAK:  not used in this part of the file.
 * LBF_DST_PINNED:  set by can_migrate_task() when it records an alternative
 *                  destination in env->new_dst_cpu.
 * LBF_SOME_PINNED: set by can_migrate_task() when a task's cpus_allowed
 *                  excludes env->dst_cpu.
 */
#define LBF_ALL_PINNED	0x01
#define LBF_NEED_BREAK	0x02
#define LBF_DST_PINNED	0x04
#define LBF_SOME_PINNED	0x08
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5657
				5658	struct lb_env {
				5659	struct sched_domain *sd;
				5660
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5661	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	5662	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5663
				5664	int dst_cpu;
				5665	struct rq *dst_rq;
				5666
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5667	struct cpumask *dst_grpmask;
				5668	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5669	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5670	long imbalance;
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5671	/* The set of CPUs under consideration for load-balancing */
				5672	struct cpumask *cpus;
				5673
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5674	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5675
				5676	unsigned int loop;
				5677	unsigned int loop_break;
				5678	unsigned int loop_max;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	5679
				5680	enum fbq_type fbq_type;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5681	struct list_head tasks;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5682	};
				5683
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5684	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5685	* Is this task likely cache-hot:
				5686	*/
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	5687	static int task_hot(struct task_struct p, struct lb_env env)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5688	{
				5689	s64 delta;
				5690
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5691	lockdep_assert_held(&env->src_rq->lock);
				5692
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5693	if (p->sched_class != &fair_sched_class)
				5694	return 0;
				5695
				5696	if (unlikely(p->policy == SCHED_IDLE))
				5697	return 0;
				5698
				5699	/*
				5700	* Buddy candidates are cache hot:
				5701	*/
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	5702	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5703	(&p->se == cfs_rq_of(&p->se)->next \|\|
				5704	&p->se == cfs_rq_of(&p->se)->last))
				5705	return 1;
				5706
				5707	if (sysctl_sched_migration_cost == -1)
				5708	return 1;
				5709	if (sysctl_sched_migration_cost == 0)
				5710	return 0;
				5711
Hillf Danton	5d5e2b1	2014-06-10 10:58:43 +0200	[diff] [blame]	5712	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5713
				5714	return delta < (s64)sysctl_sched_migration_cost;
				5715	}
				5716
#ifdef CONFIG_NUMA_BALANCING
/*
 * Returns 1, if task migration degrades locality
 * Returns 0, if task migration improves locality i.e migration preferred.
 * Returns -1, if task migration is not affected by locality.
 */
static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
{
	struct numa_group *numa_group = rcu_dereference(p->numa_group);
	unsigned long src_faults, dst_faults;
	int src_nid, dst_nid;

	if (!static_branch_likely(&sched_numa_balancing))
		return -1;

	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
		return -1;

	src_nid = cpu_to_node(env->src_cpu);
	dst_nid = cpu_to_node(env->dst_cpu);

	if (src_nid == dst_nid)
		return -1;

	/*
	 * Migrating away from the preferred node is always bad, unless every
	 * task on the source runqueue is counted in nr_preferred_running, in
	 * which case locality does not decide.
	 */
	if (src_nid == p->numa_preferred_nid) {
		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
			return 1;
		else
			return -1;
	}

	/* Encourage migration to the preferred node. */
	if (dst_nid == p->numa_preferred_nid)
		return 0;

	/* Otherwise compare fault counts, group-wide when in a numa_group. */
	if (numa_group) {
		src_faults = group_faults(p, src_nid);
		dst_faults = group_faults(p, dst_nid);
	} else {
		src_faults = task_faults(p, src_nid);
		dst_faults = task_faults(p, dst_nid);
	}

	return dst_faults < src_faults;
}

#else
/* Without CONFIG_NUMA_BALANCING locality never affects migration. */
static inline int migrate_degrades_locality(struct task_struct *p,
					     struct lb_env *env)
{
	return -1;
}
#endif
				5771
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5772	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5773	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				5774	*/
				5775	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5776	int can_migrate_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5777	{
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	5778	int tsk_cache_hot;
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5779
				5780	lockdep_assert_held(&env->src_rq->lock);
				5781
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5782	/*
				5783	* We do not migrate tasks that are:
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	5784	* 1) throttled_lb_pair, or
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5785	* 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	5786	* 3) running (obviously), or
				5787	* 4) are cache-hot on their current CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5788	*/
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	5789	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				5790	return 0;
				5791
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5792	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5793	int cpu;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5794
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	5795	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5796
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5797	env->flags \|= LBF_SOME_PINNED;
				5798
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5799	/*
				5800	* Remember if this task can be migrated to any other cpu in
				5801	* our sched_group. We may want to revisit it if we couldn't
				5802	* meet load balance goals by pulling other tasks on src_cpu.
				5803	*
				5804	* Also avoid computing new_dst_cpu if we have already computed
				5805	* one in current iteration.
				5806	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5807	if (!env->dst_grpmask \|\| (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5808	return 0;
				5809
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5810	/* Prevent to re-select dst_cpu via env's cpus */
				5811	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
				5812	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5813	env->flags \|= LBF_DST_PINNED;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5814	env->new_dst_cpu = cpu;
				5815	break;
				5816	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5817	}
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5818
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5819	return 0;
				5820	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5821
				5822	/* Record that we found atleast one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5823	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5824
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5825	if (task_running(env->src_rq, p)) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	5826	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5827	return 0;
				5828	}
				5829
				5830	/*
				5831	* Aggressive migration if:
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	5832	* 1) destination numa is preferred
				5833	* 2) task is cache cold, or
				5834	* 3) too many balance attempts have failed.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5835	*/
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	5836	tsk_cache_hot = migrate_degrades_locality(p, env);
				5837	if (tsk_cache_hot == -1)
				5838	tsk_cache_hot = task_hot(p, env);
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	5839
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	5840	if (tsk_cache_hot <= 0 \|\|
Kirill Tkhai	7a96c23	2014-09-22 22:36:12 +0400	[diff] [blame]	5841	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Srikar Dronamraju	2a1ed24	2015-06-16 17:25:59 +0530	[diff] [blame]	5842	if (tsk_cache_hot == 1) {
Mel Gorman	3a7053b	2013-10-07 11:29:00 +0100	[diff] [blame]	5843	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
				5844	schedstat_inc(p, se.statistics.nr_forced_migrations);
				5845	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5846	return 1;
				5847	}
				5848
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	5849	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
				5850	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5851	}
				5852
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5853	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5854	* detach_task() -- detach the task for the migration specified in env
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5855	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5856	static void detach_task(struct task_struct p, struct lb_env env)
				5857	{
				5858	lockdep_assert_held(&env->src_rq->lock);
				5859
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5860	p->on_rq = TASK_ON_RQ_MIGRATING;
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	5861	deactivate_task(env->src_rq, p, 0);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5862	set_task_cpu(p, env->dst_cpu);
				5863	}
				5864
				5865	/*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5866	* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5867	* part of active balancing operations within "domain".
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5868	*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5869	* Returns a task if successful and NULL otherwise.
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5870	*/
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5871	static struct task_struct detach_one_task(struct lb_env env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5872	{
				5873	struct task_struct p, n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5874
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5875	lockdep_assert_held(&env->src_rq->lock);
				5876
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5877	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5878	if (!can_migrate_task(p, env))
				5879	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5880
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5881	detach_task(p, env);
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5882
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5883	/*
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5884	* Right now, this is only the second place where
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5885	* lb_gained[env->idle] is updated (other is detach_tasks)
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5886	* so we can safely collect stats here rather than
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5887	* inside detach_tasks().
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5888	*/
				5889	schedstat_inc(env->sd, lb_gained[env->idle]);
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5890	return p;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5891	}
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	5892	return NULL;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	5893	}
				5894
/*
 * Step by which detach_tasks() advances env->loop_break every time it
 * interrupts its scan and sets LBF_NEED_BREAK.
 */
static const unsigned int sched_nr_migrate_break = 32U;
				5896
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5897	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5898	* detach_tasks() -- tries to detach up to imbalance weighted load from
				5899	* busiest_rq, as part of a balancing operation within domain "sd".
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5900	*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5901	* Returns number of detached tasks if successful and 0 otherwise.
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5902	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5903	static int detach_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5904	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5905	struct list_head *tasks = &env->src_rq->cfs_tasks;
				5906	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5907	unsigned long load;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5908	int detached = 0;
				5909
				5910	lockdep_assert_held(&env->src_rq->lock);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5911
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5912	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5913	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5914
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5915	while (!list_empty(tasks)) {
Yuyang Du	985d3a4	2015-07-06 06:11:51 +0800	[diff] [blame]	5916	/*
				5917	* We don't want to steal all, otherwise we may be treated likewise,
				5918	* which could at worst lead to a livelock crash.
				5919	*/
				5920	if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
				5921	break;
				5922
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5923	p = list_first_entry(tasks, struct task_struct, se.group_node);
				5924
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5925	env->loop++;
				5926	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5927	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5928	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5929
				5930	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5931	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	5932	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5933	env->flags \|= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	5934	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	5935	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5936
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	5937	if (!can_migrate_task(p, env))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5938	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5939
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5940	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5941
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	5942	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5943	goto next;
				5944
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5945	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5946	goto next;
				5947
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5948	detach_task(p, env);
				5949	list_add(&p->se.group_node, &env->tasks);
				5950
				5951	detached++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5952	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5953
				5954	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	5955	/*
				5956	* NEWIDLE balancing is a source of latency, so preemptible
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5957	* kernels will stop after the first task is detached to minimize
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	5958	* the critical section.
				5959	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5960	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	5961	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5962	#endif
				5963
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	5964	/*
				5965	* We only want to steal up to the prescribed amount of
				5966	* weighted load.
				5967	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5968	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	5969	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5970
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	5971	continue;
				5972	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5973	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5974	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5975
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5976	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5977	* Right now, this is one of only two places we collect this stat
				5978	* so we can safely collect detach_one_task() stats here rather
				5979	* than inside detach_one_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5980	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5981	schedstat_add(env->sd, lb_gained[env->idle], detached);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5982
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5983	return detached;
				5984	}
				5985
				5986	/*
				5987	* attach_task() -- attach the task detached by detach_task() to its new rq.
				5988	*/
				5989	static void attach_task(struct rq rq, struct task_struct p)
				5990	{
				5991	lockdep_assert_held(&rq->lock);
				5992
				5993	BUG_ON(task_rq(p) != rq);
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5994	activate_task(rq, p, 0);
Joonwoo Park	3ea94de	2015-11-12 19:38:54 -0800	[diff] [blame]	5995	p->on_rq = TASK_ON_RQ_QUEUED;
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	5996	check_preempt_curr(rq, p, 0);
				5997	}
				5998
				5999	/*
				6000	* attach_one_task() -- attaches the task returned from detach_one_task() to
				6001	* its new rq.
				6002	*/
				6003	static void attach_one_task(struct rq rq, struct task_struct p)
				6004	{
				6005	raw_spin_lock(&rq->lock);
				6006	attach_task(rq, p);
				6007	raw_spin_unlock(&rq->lock);
				6008	}
				6009
				6010	/*
				6011	* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
				6012	* new rq.
				6013	*/
				6014	static void attach_tasks(struct lb_env *env)
				6015	{
				6016	struct list_head *tasks = &env->tasks;
				6017	struct task_struct *p;
				6018
				6019	raw_spin_lock(&env->dst_rq->lock);
				6020
				6021	while (!list_empty(tasks)) {
				6022	p = list_first_entry(tasks, struct task_struct, se.group_node);
				6023	list_del_init(&p->se.group_node);
				6024
				6025	attach_task(env->dst_rq, p);
				6026	}
				6027
				6028	raw_spin_unlock(&env->dst_rq->lock);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6029	}
				6030
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6031	#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6032	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6033	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6034	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6035	struct cfs_rq *cfs_rq;
				6036	unsigned long flags;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6037
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6038	raw_spin_lock_irqsave(&rq->lock, flags);
				6039	update_rq_clock(rq);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6040
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6041	/*
				6042	* Iterates the task_group tree in a bottom up fashion, see
				6043	* list_add_leaf_cfs_rq() for details.
				6044	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	6045	for_each_leaf_cfs_rq(rq, cfs_rq) {
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6046	/* throttled entities do not contribute to load */
				6047	if (throttled_hierarchy(cfs_rq))
				6048	continue;
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6049
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6050	if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
				6051	update_tg_load_avg(cfs_rq, 0);
				6052	}
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6053	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6054	}
				6055
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6056	/*
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6057	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6058	* This needs to be done in a top-down fashion because the load of a child
				6059	* group is a fraction of its parents load.
				6060	*/
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6061	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6062	{
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6063	struct rq *rq = rq_of(cfs_rq);
				6064	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	6065	unsigned long now = jiffies;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6066	unsigned long load;
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	6067
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6068	if (cfs_rq->last_h_load_update == now)
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	6069	return;
				6070
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6071	cfs_rq->h_load_next = NULL;
				6072	for_each_sched_entity(se) {
				6073	cfs_rq = cfs_rq_of(se);
				6074	cfs_rq->h_load_next = se;
				6075	if (cfs_rq->last_h_load_update == now)
				6076	break;
				6077	}
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	6078
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6079	if (!se) {
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	6080	cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6081	cfs_rq->last_h_load_update = now;
				6082	}
				6083
				6084	while ((se = cfs_rq->h_load_next) != NULL) {
				6085	load = cfs_rq->h_load;
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	6086	load = div64_ul(load * se->avg.load_avg,
				6087	cfs_rq_load_avg(cfs_rq) + 1);
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6088	cfs_rq = group_cfs_rq(se);
				6089	cfs_rq->h_load = load;
				6090	cfs_rq->last_h_load_update = now;
				6091	}
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	6092	}
				6093
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6094	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6095	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6096	struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6097
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	6098	update_cfs_rq_h_load(cfs_rq);
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6099	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
Yuyang Du	7ea241a	2015-07-15 08:04:42 +0800	[diff] [blame]	6100	cfs_rq_load_avg(cfs_rq) + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6101	}
				6102	#else
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	6103	static inline void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6104	{
Vincent Guittot	6c1d47c	2015-07-15 08:04:38 +0800	[diff] [blame]	6105	struct rq *rq = cpu_rq(cpu);
				6106	struct cfs_rq *cfs_rq = &rq->cfs;
				6107	unsigned long flags;
				6108
				6109	raw_spin_lock_irqsave(&rq->lock, flags);
				6110	update_rq_clock(rq);
				6111	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
				6112	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	6113	}
				6114
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	6115	static unsigned long task_h_load(struct task_struct *p)
				6116	{
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	6117	return p->se.avg.load_avg;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	6118	}
				6119	#endif
				6120
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6121	/******** Helpers for find_busiest_group **********************/
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6122
				6123	enum group_type {
				6124	group_other = 0,
				6125	group_imbalanced,
				6126	group_overloaded,
				6127	};
				6128
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6129	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6130	* sg_lb_stats - stats of a sched_group required for load_balancing
				6131	*/
				6132	struct sg_lb_stats {
				6133	unsigned long avg_load; /Avg load across the CPUs of the group /
				6134	unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6135	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6136	unsigned long load_per_task;
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6137	unsigned long group_capacity;
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	6138	unsigned long group_util; /* Total utilization of the group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6139	unsigned int sum_nr_running; /* Nr tasks running in the group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6140	unsigned int idle_cpus;
				6141	unsigned int group_weight;
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6142	enum group_type group_type;
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6143	int group_no_capacity;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6144	#ifdef CONFIG_NUMA_BALANCING
				6145	unsigned int nr_numa_running;
				6146	unsigned int nr_preferred_running;
				6147	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6148	};
				6149
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6150	/*
				6151	* sd_lb_stats - Structure to store the statistics of a sched_domain
				6152	* during load balancing.
				6153	*/
				6154	struct sd_lb_stats {
				6155	struct sched_group busiest; / Busiest group in this sd */
				6156	struct sched_group local; / Local group in this sd */
				6157	unsigned long total_load; /* Total load of all groups in sd */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6158	unsigned long total_capacity; /* Total capacity of all groups in sd */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6159	unsigned long avg_load; /* Average load across all groups in sd */
				6160
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6161	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6162	struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6163	};
				6164
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6165	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
				6166	{
				6167	/*
				6168	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
				6169	* local_stat because update_sg_lb_stats() does a full clear/assignment.
				6170	* We must however clear busiest_stat::avg_load because
				6171	* update_sd_pick_busiest() reads this before assignment.
				6172	*/
				6173	*sds = (struct sd_lb_stats){
				6174	.busiest = NULL,
				6175	.local = NULL,
				6176	.total_load = 0UL,
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6177	.total_capacity = 0UL,
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6178	.busiest_stat = {
				6179	.avg_load = 0UL,
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6180	.sum_nr_running = 0,
				6181	.group_type = group_other,
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6182	},
				6183	};
				6184	}
				6185
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6186	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6187	* get_sd_load_idx - Obtain the load index for a given sched domain.
				6188	* @sd: The sched_domain whose load_idx is to be obtained.
Kamalesh Babulal	ed1b773	2013-10-13 23:06:15 +0530	[diff] [blame]	6189	* @idle: The idle status of the CPU for whose sd load_idx is obtained.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	6190	*
				6191	* Return: The load index.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6192	*/
				6193	static inline int get_sd_load_idx(struct sched_domain *sd,
				6194	enum cpu_idle_type idle)
				6195	{
				6196	int load_idx;
				6197
				6198	switch (idle) {
				6199	case CPU_NOT_IDLE:
				6200	load_idx = sd->busy_idx;
				6201	break;
				6202
				6203	case CPU_NEWLY_IDLE:
				6204	load_idx = sd->newidle_idx;
				6205	break;
				6206	default:
				6207	load_idx = sd->idle_idx;
				6208	break;
				6209	}
				6210
				6211	return load_idx;
				6212	}
				6213
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6214	static unsigned long scale_rt_capacity(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6215	{
				6216	struct rq *rq = cpu_rq(cpu);
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	6217	u64 total, used, age_stamp, avg;
Peter Zijlstra	cadefd3	2014-02-27 10:40:35 +0100	[diff] [blame]	6218	s64 delta;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6219
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	6220	/*
				6221	* Since we're reading these variables without serialization make sure
				6222	* we read them once before doing sanity checks on them.
				6223	*/
Jason Low	316c1608d	2015-04-28 13:00:20 -0700	[diff] [blame]	6224	age_stamp = READ_ONCE(rq->age_stamp);
				6225	avg = READ_ONCE(rq->rt_avg);
Peter Zijlstra	cebde6d	2015-01-05 11:18:10 +0100	[diff] [blame]	6226	delta = __rq_clock_broken(rq) - age_stamp;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	6227
Peter Zijlstra	cadefd3	2014-02-27 10:40:35 +0100	[diff] [blame]	6228	if (unlikely(delta < 0))
				6229	delta = 0;
				6230
				6231	total = sched_avg_period() + delta;
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	6232
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	6233	used = div_u64(avg, total);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6234
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	6235	if (likely(used < SCHED_CAPACITY_SCALE))
				6236	return SCHED_CAPACITY_SCALE - used;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6237
Vincent Guittot	b5b4860	2015-02-27 16:54:08 +0100	[diff] [blame]	6238	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6239	}
				6240
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6241	static void update_cpu_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6242	{
Morten Rasmussen	8cd5601	2015-08-14 17:23:10 +0100	[diff] [blame]	6243	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6244	struct sched_group *sdg = sd->groups;
				6245
Vincent Guittot	ca6d75e	2015-02-27 16:54:09 +0100	[diff] [blame]	6246	cpu_rq(cpu)->cpu_capacity_orig = capacity;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	6247
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6248	capacity *= scale_rt_capacity(cpu);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6249	capacity >>= SCHED_CAPACITY_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6250
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6251	if (!capacity)
				6252	capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6253
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6254	cpu_rq(cpu)->cpu_capacity = capacity;
				6255	sdg->sgc->capacity = capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6256	}
				6257
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6258	void update_group_capacity(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6259	{
				6260	struct sched_domain *child = sd->child;
				6261	struct sched_group group, sdg = sd->groups;
Vincent Guittot	dc7ff76	2015-03-03 11:35:03 +0100	[diff] [blame]	6262	unsigned long capacity;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	6263	unsigned long interval;
				6264
				6265	interval = msecs_to_jiffies(sd->balance_interval);
				6266	interval = clamp(interval, 1UL, max_load_balance_interval);
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6267	sdg->sgc->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6268
				6269	if (!child) {
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6270	update_cpu_capacity(sd, cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6271	return;
				6272	}
				6273
Vincent Guittot	dc7ff76	2015-03-03 11:35:03 +0100	[diff] [blame]	6274	capacity = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6275
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	6276	if (child->flags & SD_OVERLAP) {
				6277	/*
				6278	* SD_OVERLAP domains cannot assume that child groups
				6279	* span the current group.
				6280	*/
				6281
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	6282	for_each_cpu(cpu, sched_group_cpus(sdg)) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6283	struct sched_group_capacity *sgc;
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	6284	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	6285
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	6286	/*
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6287	* build_sched_domains() -> init_sched_groups_capacity()
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	6288	* gets here before we've attached the domains to the
				6289	* runqueues.
				6290	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6291	* Use capacity_of(), which is set irrespective of domains
				6292	* in update_cpu_capacity().
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	6293	*
Vincent Guittot	dc7ff76	2015-03-03 11:35:03 +0100	[diff] [blame]	6294	* This avoids capacity from being 0 and
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	6295	* causing divide-by-zero issues on boot.
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	6296	*/
				6297	if (unlikely(!rq->sd)) {
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6298	capacity += capacity_of(cpu);
Srikar Dronamraju	9abf24d	2013-11-12 22:11:26 +0530	[diff] [blame]	6299	continue;
				6300	}
				6301
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6302	sgc = rq->sd->groups->sgc;
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6303	capacity += sgc->capacity;
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	6304	}
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	6305	} else {
				6306	/*
				6307	* !SD_OVERLAP domains can assume that child groups
				6308	* span the current group.
				6309	*/
				6310
				6311	group = child->groups;
				6312	do {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6313	capacity += group->sgc->capacity;
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	6314	group = group->next;
				6315	} while (group != child->groups);
				6316	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6317
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6318	sdg->sgc->capacity = capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6319	}
				6320
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	6321	/*
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6322	* Check whether the capacity of the rq has been noticeably reduced by side
				6323	* activity. The imbalance_pct is used for the threshold.
				6324	* Return true is the capacity is reduced
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	6325	*/
				6326	static inline int
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6327	check_cpu_capacity(struct rq rq, struct sched_domain sd)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	6328	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6329	return ((rq->cpu_capacity * sd->imbalance_pct) <
				6330	(rq->cpu_capacity_orig * 100));
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	6331	}
				6332
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6333	/*
				6334	* Group imbalance indicates (and tries to solve) the problem where balancing
				6335	* groups is inadequate due to tsk_cpus_allowed() constraints.
				6336	*
				6337	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
				6338	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
				6339	* Something like:
				6340	*
				6341	* { 0 1 2 3 } { 4 5 6 7 }
				6342	* * * * *
				6343	*
				6344	* If we were to balance group-wise we'd place two tasks in the first group and
				6345	* two tasks in the second group. Clearly this is undesired as it will overload
				6346	* cpu 3 and leave one of the cpus in the second group unused.
				6347	*
				6348	* The current solution to this issue is detecting the skew in the first group
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6349	* by noticing the lower domain failed to reach balance and had difficulty
				6350	* moving tasks due to affinity constraints.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6351	*
				6352	* When this is so detected; this group becomes a candidate for busiest; see
Kamalesh Babulal	ed1b773	2013-10-13 23:06:15 +0530	[diff] [blame]	6353	* update_sd_pick_busiest(). And calculate_imbalance() and
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6354	* find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6355	* to create an effective group imbalance.
				6356	*
				6357	* This is a somewhat tricky proposition since the next run might not find the
				6358	* group imbalance and decide the groups need to be balanced again. A most
				6359	* subtle and fragile situation.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6360	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6361
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6362	static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6363	{
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6364	return group->sgc->imbalance;
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6365	}
				6366
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	6367	/*
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6368	* group_has_capacity returns true if the group has spare capacity that could
				6369	* be used by some tasks.
				6370	* We consider that a group has spare capacity if the * number of task is
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	6371	* smaller than the number of CPUs or if the utilization is lower than the
				6372	* available capacity for CFS tasks.
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6373	* For the latter, we use a threshold to stabilize the state, to take into
				6374	* account the variance of the tasks' load and to return true if the available
				6375	* capacity in meaningful for the load balancer.
				6376	* As an example, an available capacity of 1% can appear but it doesn't make
				6377	* any benefit for the load balance.
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	6378	*/
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6379	static inline bool
				6380	group_has_capacity(struct lb_env env, struct sg_lb_stats sgs)
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	6381	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6382	if (sgs->sum_nr_running < sgs->group_weight)
				6383	return true;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	6384
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6385	if ((sgs->group_capacity * 100) >
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	6386	(sgs->group_util * env->sd->imbalance_pct))
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6387	return true;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	6388
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6389	return false;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	6390	}
				6391
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6392	/*
				6393	* group_is_overloaded returns true if the group has more tasks than it can
				6394	* handle.
				6395	* group_is_overloaded is not equals to !group_has_capacity because a group
				6396	* with the exact right number of tasks, has no more spare capacity but is not
				6397	* overloaded so both group_has_capacity and group_is_overloaded return
				6398	* false.
				6399	*/
				6400	static inline bool
				6401	group_is_overloaded(struct lb_env env, struct sg_lb_stats sgs)
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6402	{
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6403	if (sgs->sum_nr_running <= sgs->group_weight)
				6404	return false;
				6405
				6406	if ((sgs->group_capacity * 100) <
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	6407	(sgs->group_util * env->sd->imbalance_pct))
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6408	return true;
				6409
				6410	return false;
				6411	}
				6412
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	6413	static inline enum
				6414	group_type group_classify(struct sched_group *group,
				6415	struct sg_lb_stats *sgs)
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6416	{
				6417	if (sgs->group_no_capacity)
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6418	return group_overloaded;
				6419
				6420	if (sg_imbalanced(group))
				6421	return group_imbalanced;
				6422
				6423	return group_other;
				6424	}
				6425
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6426	/**
				6427	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
				6428	* @env: The load balancing environment.
				6429	* @group: sched_group whose statistics are to be updated.
				6430	* @load_idx: Load index of sched_domain of this_cpu for load calc.
				6431	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6432	* @sgs: variable to hold the statistics for this group.
Masanari Iida	cd3bd4e	2014-07-28 12:38:06 +0900	[diff] [blame]	6433	* @overload: Indicate more than one runnable task for any CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6434	*/
				6435	static inline void update_sg_lb_stats(struct lb_env *env,
				6436	struct sched_group *group, int load_idx,
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	6437	int local_group, struct sg_lb_stats *sgs,
				6438	bool *overload)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6439	{
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6440	unsigned long load;
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	6441	int i, nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6442
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	6443	memset(sgs, 0, sizeof(*sgs));
				6444
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	6445	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6446	struct rq *rq = cpu_rq(i);
				6447
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6448	/* Bias balancing toward cpus of our domain */
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6449	if (local_group)
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	6450	load = target_load(i, load_idx);
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	6451	else
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6452	load = source_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6453
				6454	sgs->group_load += load;
Dietmar Eggemann	9e91d61	2015-08-14 17:23:12 +0100	[diff] [blame]	6455	sgs->group_util += cpu_util(i);
Vincent Guittot	65fdac0	2014-08-26 13:06:46 +0200	[diff] [blame]	6456	sgs->sum_nr_running += rq->cfs.h_nr_running;
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	6457
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	6458	nr_running = rq->nr_running;
				6459	if (nr_running > 1)
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	6460	*overload = true;
				6461
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6462	#ifdef CONFIG_NUMA_BALANCING
				6463	sgs->nr_numa_running += rq->nr_numa_running;
				6464	sgs->nr_preferred_running += rq->nr_preferred_running;
				6465	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6466	sgs->sum_weighted_load += weighted_cpuload(i);
Waiman Long	a426f99	2015-11-25 14:09:38 -0500	[diff] [blame]	6467	/*
				6468	* No need to call idle_cpu() if nr_running is not 0
				6469	*/
				6470	if (!nr_running && idle_cpu(i))
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	6471	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6472	}
				6473
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6474	/* Adjust by relative CPU capacity of the group */
				6475	sgs->group_capacity = group->sgc->capacity;
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6476	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6477
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	6478	if (sgs->sum_nr_running)
Peter Zijlstra	38d0f77	2013-08-15 19:47:56 +0200	[diff] [blame]	6479	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6480
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	6481	sgs->group_weight = group->group_weight;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	6482
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6483	sgs->group_no_capacity = group_is_overloaded(env, sgs);
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	6484	sgs->group_type = group_classify(group, sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6485	}
				6486
				6487	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6488	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	6489	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6490	* @sds: sched_domain statistics
				6491	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	6492	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6493	*
				6494	* Determine if @sg is a busier group than the previously selected
				6495	* busiest group.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	6496	*
				6497	* Return: %true if @sg is a busier group than the previously selected
				6498	* busiest group. %false otherwise.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6499	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6500	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6501	struct sd_lb_stats *sds,
				6502	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6503	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6504	{
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6505	struct sg_lb_stats *busiest = &sds->busiest_stat;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6506
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6507	if (sgs->group_type > busiest->group_type)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6508	return true;
				6509
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6510	if (sgs->group_type < busiest->group_type)
				6511	return false;
				6512
				6513	if (sgs->avg_load <= busiest->avg_load)
				6514	return false;
				6515
				6516	/* This is the busiest node in its class. */
				6517	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6518	return true;
				6519
				6520	/*
				6521	* ASYM_PACKING needs to move all the work to the lowest
				6522	* numbered CPUs in the group, therefore mark all groups
				6523	* higher than ourself as busy.
				6524	*/
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6525	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6526	if (!sds->busiest)
				6527	return true;
				6528
				6529	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
				6530	return true;
				6531	}
				6532
				6533	return false;
				6534	}
				6535
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6536	#ifdef CONFIG_NUMA_BALANCING
				6537	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				6538	{
				6539	if (sgs->sum_nr_running > sgs->nr_numa_running)
				6540	return regular;
				6541	if (sgs->sum_nr_running > sgs->nr_preferred_running)
				6542	return remote;
				6543	return all;
				6544	}
				6545
				6546	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				6547	{
				6548	if (rq->nr_running > rq->nr_numa_running)
				6549	return regular;
				6550	if (rq->nr_running > rq->nr_preferred_running)
				6551	return remote;
				6552	return all;
				6553	}
				6554	#else
				6555	static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
				6556	{
				6557	return all;
				6558	}
				6559
				6560	static inline enum fbq_type fbq_classify_rq(struct rq *rq)
				6561	{
				6562	return regular;
				6563	}
				6564	#endif /* CONFIG_NUMA_BALANCING */
				6565
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6566	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	6567	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	6568	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6569	* @sds: variable to hold the statistics for this sched_domain.
				6570	*/
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6571	static inline void update_sd_lb_stats(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6572	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6573	struct sched_domain *child = env->sd->child;
				6574	struct sched_group *sg = env->sd->groups;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6575	struct sg_lb_stats tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6576	int load_idx, prefer_sibling = 0;
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	6577	bool overload = false;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6578
				6579	if (child && child->flags & SD_PREFER_SIBLING)
				6580	prefer_sibling = 1;
				6581
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6582	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6583
				6584	do {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6585	struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6586	int local_group;
				6587
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6588	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6589	if (local_group) {
				6590	sds->local = sg;
				6591	sgs = &sds->local_stat;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	6592
				6593	if (env->idle != CPU_NEWLY_IDLE \|\|
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6594	time_after_eq(jiffies, sg->sgc->next_update))
				6595	update_group_capacity(env->sd, env->dst_cpu);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6596	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6597
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	6598	update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
				6599	&overload);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6600
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	6601	if (local_group)
				6602	goto next_group;
				6603
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6604	/*
				6605	* In case the child domain prefers tasks go to siblings
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6606	* first, lower the sg capacity so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	6607	* and move all the excess tasks away. We lower the capacity
				6608	* of a group only if the local group has the capacity to fit
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6609	* these excess tasks. The extra check prevents the case where
				6610	* you always pull from the heaviest group when it is already
				6611	* under-utilized (possible with a large weight task outweighs
				6612	* the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6613	*/
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	6614	if (prefer_sibling && sds->local &&
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6615	group_has_capacity(env, &sds->local_stat) &&
				6616	(sgs->sum_nr_running > 1)) {
				6617	sgs->group_no_capacity = 1;
Leo Yan	79a89f9	2015-09-15 18:56:45 +0800	[diff] [blame]	6618	sgs->group_type = group_classify(sg, sgs);
Wanpeng Li	cb0b9f2	2014-11-05 07:44:50 +0800	[diff] [blame]	6619	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6620
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	6621	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6622	sds->busiest = sg;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6623	sds->busiest_stat = *sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6624	}
				6625
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	6626	next_group:
				6627	/* Now, start updating sd_lb_stats */
				6628	sds->total_load += sgs->group_load;
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6629	sds->total_capacity += sgs->group_capacity;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	6630
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6631	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6632	} while (sg != env->sd->groups);
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6633
				6634	if (env->sd->flags & SD_NUMA)
				6635	env->fbq_type = fbq_classify_group(&sds->busiest_stat);
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	6636
				6637	if (!env->sd->parent) {
				6638	/* update overload indicator if we are at root domain */
				6639	if (env->dst_rq->rd->overload != overload)
				6640	env->dst_rq->rd->overload = overload;
				6641	}
				6642
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6643	}
				6644
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6645	/**
				6646	* check_asym_packing - Check to see if the group is packed into the
				6647	* sched doman.
				6648	*
				6649	* This is primarily intended to used at the sibling level. Some
				6650	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				6651	* case of POWER7, it can move to lower SMT modes only when higher
				6652	* threads are idle. When in lower SMT modes, the threads will
				6653	* perform better since they share less core resources. Hence when we
				6654	* have idle threads, we want them to be the higher ones.
				6655	*
				6656	* This packing function is run on idle threads. It checks to see if
				6657	* the busiest CPU in this domain (core in the P7 case) has a higher
				6658	* CPU number than the packing function is being run on. Here we are
				6659	* assuming lower CPU number will be equivalent to lower a SMT thread
				6660	* number.
				6661	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	6662	* Return: 1 when packing is required and a task should be moved to
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	6663	* this CPU. The amount of the imbalance is returned in *imbalance.
				6664	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	6665	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6666	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6667	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6668	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6669	{
				6670	int busiest_cpu;
				6671
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6672	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6673	return 0;
				6674
				6675	if (!sds->busiest)
				6676	return 0;
				6677
				6678	busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6679	if (env->dst_cpu > busiest_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6680	return 0;
				6681
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6682	env->imbalance = DIV_ROUND_CLOSEST(
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6683	sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6684	SCHED_CAPACITY_SCALE);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6685
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6686	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6687	}
				6688
				6689	/**
				6690	* fix_small_imbalance - Calculate the minor imbalance that exists
				6691	* amongst the groups of a sched_domain, during
				6692	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	6693	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6694	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6695	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6696	static inline
				6697	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6698	{
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6699	unsigned long tmp, capa_now = 0, capa_move = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6700	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	6701	unsigned long scaled_busy_load_per_task;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6702	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6703
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6704	local = &sds->local_stat;
				6705	busiest = &sds->busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6706
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6707	if (!local->sum_nr_running)
				6708	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
				6709	else if (busiest->load_per_task > local->load_per_task)
				6710	imbn = 1;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	6711
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6712	scaled_busy_load_per_task =
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6713	(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6714	busiest->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6715
Vladimir Davydov	3029ede	2013-09-15 17:49:14 +0400	[diff] [blame]	6716	if (busiest->avg_load + scaled_busy_load_per_task >=
				6717	local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6718	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6719	return;
				6720	}
				6721
				6722	/*
				6723	* OK, we don't have enough imbalance to justify moving tasks,
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6724	* however we may be able to increase total CPU capacity used by
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6725	* moving them.
				6726	*/
				6727
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6728	capa_now += busiest->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6729	min(busiest->load_per_task, busiest->avg_load);
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6730	capa_now += local->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6731	min(local->load_per_task, local->avg_load);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6732	capa_now /= SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6733
				6734	/* Amount of load we'd subtract */
Vincent Guittot	a2cd426	2014-03-11 17:26:06 +0100	[diff] [blame]	6735	if (busiest->avg_load > scaled_busy_load_per_task) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6736	capa_move += busiest->group_capacity *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6737	min(busiest->load_per_task,
Vincent Guittot	a2cd426	2014-03-11 17:26:06 +0100	[diff] [blame]	6738	busiest->avg_load - scaled_busy_load_per_task);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6739	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6740
				6741	/* Amount of load we'd add */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6742	if (busiest->avg_load * busiest->group_capacity <
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6743	busiest->load_per_task * SCHED_CAPACITY_SCALE) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6744	tmp = (busiest->avg_load * busiest->group_capacity) /
				6745	local->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6746	} else {
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6747	tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6748	local->group_capacity;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6749	}
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6750	capa_move += local->group_capacity *
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	6751	min(local->load_per_task, local->avg_load + tmp);
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6752	capa_move /= SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6753
				6754	/* Move if we gain throughput */
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6755	if (capa_move > capa_now)
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6756	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6757	}
				6758
				6759	/**
				6760	* calculate_imbalance - Calculate the amount of imbalance present within the
				6761	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6762	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6763	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6764	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6765	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6766	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	6767	unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6768	struct sg_lb_stats local, busiest;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	6769
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6770	local = &sds->local_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6771	busiest = &sds->busiest_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6772
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6773	if (busiest->group_type == group_imbalanced) {
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6774	/*
				6775	* In the group_imb case we cannot rely on group-wide averages
				6776	* to ensure cpu-load equilibrium, look at wider averages. XXX
				6777	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6778	busiest->load_per_task =
				6779	min(busiest->load_per_task, sds->avg_load);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	6780	}
				6781
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6782	/*
				6783	* In the presence of smp nice balancing, certain scenarios can have
				6784	* max load less than avg load(as we skip the groups at or below
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6785	* its cpu_capacity, while calculating max_load..)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6786	*/
Vladimir Davydov	b188555	2013-09-15 17:49:13 +0400	[diff] [blame]	6787	if (busiest->avg_load <= sds->avg_load \|\|
				6788	local->avg_load >= sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6789	env->imbalance = 0;
				6790	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6791	}
				6792
Peter Zijlstra	9a5d9ba	2014-07-29 17:15:11 +0200	[diff] [blame]	6793	/*
				6794	* If there aren't any idle cpus, avoid creating some.
				6795	*/
				6796	if (busiest->group_type == group_overloaded &&
				6797	local->group_type == group_overloaded) {
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6798	load_above_capacity = busiest->sum_nr_running *
				6799	SCHED_LOAD_SCALE;
				6800	if (load_above_capacity > busiest->group_capacity)
				6801	load_above_capacity -= busiest->group_capacity;
				6802	else
				6803	load_above_capacity = ~0UL;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	6804	}
				6805
				6806	/*
				6807	* We're trying to get all the cpus to the average_load, so we don't
				6808	* want to push ourselves above the average load, nor do we wish to
				6809	* reduce the max loaded cpu below the average load. At the same time,
				6810	* we also don't want to reduce the group load below the group capacity
				6811	* (so that we can implement power-savings policies etc). Thus we look
				6812	* for the minimum possible imbalance.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	6813	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6814	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6815
				6816	/* How much load to actually move to equalise the imbalance */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6817	env->imbalance = min(
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	6818	max_pull * busiest->group_capacity,
				6819	(sds->avg_load - local->avg_load) * local->group_capacity
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6820	) / SCHED_CAPACITY_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6821
				6822	/*
				6823	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	6824	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6825	* a think about bumping its value to force at least one task to be
				6826	* moved
				6827	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6828	if (env->imbalance < busiest->load_per_task)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6829	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6830	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	6831
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6832	/***** find_busiest_group() helpers end here *******************/
				6833
				6834	/**
				6835	* find_busiest_group - Returns the busiest group within the sched_domain
				6836	* if there is an imbalance. If there isn't an imbalance, and
				6837	* the user has opted for power-savings, it returns a group whose
				6838	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
				6839	* such a group exists.
				6840	*
				6841	* Also calculates the amount of weighted load which should be moved
				6842	* to restore balance.
				6843	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	6844	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6845	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	6846	* Return: - The busiest group if imbalance exists.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6847	* - If no imbalance and user has opted for power-savings balance,
				6848	* return the least loaded group whose CPUs can be
				6849	* put to idle by rebalancing its tasks onto our group.
				6850	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6851	static struct sched_group find_busiest_group(struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6852	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6853	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6854	struct sd_lb_stats sds;
				6855
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	6856	init_sd_lb_stats(&sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6857
				6858	/*
				6859	* Compute the various statistics relavent for load balancing at
				6860	* this level.
				6861	*/
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	6862	update_sd_lb_stats(env, &sds);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6863	local = &sds.local_stat;
				6864	busiest = &sds.busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6865
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6866	/* ASYM feature bypasses nice load balance check */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6867	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&
				6868	check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	6869	return sds.busiest;
				6870
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	6871	/* There is no busy sibling group to pull tasks from */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6872	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6873	goto out_balanced;
				6874
Nicolas Pitre	ca8ce3d	2014-05-26 18:19:39 -0400	[diff] [blame]	6875	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
				6876	/ sds.total_capacity;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	6877
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	6878	/*
				6879	* If the busiest group is imbalanced the below checks don't
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	6880	* work because they assume all things are equal, which typically
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	6881	* isn't true due to cpus_allowed constraints and the like.
				6882	*/
Rik van Riel	caeb178	2014-07-28 14:16:28 -0400	[diff] [blame]	6883	if (busiest->group_type == group_imbalanced)
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	6884	goto force_balance;
				6885
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	6886	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6887	if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
				6888	busiest->group_no_capacity)
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	6889	goto force_balance;
				6890
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	6891	/*
Zhihui Zhang	9c58c79	2014-09-20 21:24:36 -0400	[diff] [blame]	6892	* If the local group is busier than the selected busiest group
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	6893	* don't try and pull any tasks.
				6894	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6895	if (local->avg_load >= busiest->avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6896	goto out_balanced;
				6897
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	6898	/*
				6899	* Don't pull any tasks if this group is already above the domain
				6900	* average load.
				6901	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6902	if (local->avg_load >= sds.avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6903	goto out_balanced;
				6904
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6905	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	6906	/*
Vincent Guittot	43f4d66	2014-10-01 15:38:55 +0200	[diff] [blame]	6907	* This cpu is idle. If the busiest group is not overloaded
				6908	* and there is no imbalance between this and busiest group
				6909	* wrt idle cpus, it is balanced. The imbalance becomes
				6910	* significant if the diff is greater than 1 otherwise we
				6911	* might end up to just move the imbalance on another group
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	6912	*/
Vincent Guittot	43f4d66	2014-10-01 15:38:55 +0200	[diff] [blame]	6913	if ((busiest->group_type != group_overloaded) &&
				6914	(local->idle_cpus <= (busiest->idle_cpus + 1)))
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	6915	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	6916	} else {
				6917	/*
				6918	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				6919	* imbalance_pct to be conservative.
				6920	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	6921	if (100 * busiest->avg_load <=
				6922	env->sd->imbalance_pct * local->avg_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	6923	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	6924	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6925
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	6926	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6927	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6928	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6929	return sds.busiest;
				6930
				6931	out_balanced:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6932	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6933	return NULL;
				6934	}
				6935
				6936	/*
				6937	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				6938	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	6939	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	6940	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6941	{
				6942	struct rq busiest = NULL, rq;
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6943	unsigned long busiest_load = 0, busiest_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6944	int i;
				6945
Peter Zijlstra	6906a40	2013-08-19 15:20:21 +0200	[diff] [blame]	6946	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6947	unsigned long capacity, wl;
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6948	enum fbq_type rt;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6949
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	6950	rq = cpu_rq(i);
				6951	rt = fbq_classify_rq(rq);
				6952
				6953	/*
				6954	* We classify groups/runqueues into three groups:
				6955	* - regular: there are !numa tasks
				6956	* - remote: there are numa tasks that run on the 'wrong' node
				6957	* - all: there is no distinction
				6958	*
				6959	* In order to avoid migrating ideally placed numa tasks,
				6960	* ignore those when there's better options.
				6961	*
				6962	* If we ignore the actual busiest queue to migrate another
				6963	* task, the next balance pass can still reduce the busiest
				6964	* queue by moving tasks around inside the node.
				6965	*
				6966	* If we cannot move enough load due to this classification
				6967	* the next pass will adjust the group classification and
				6968	* allow migration of more tasks.
				6969	*
				6970	* Both cases only affect the total convergence complexity.
				6971	*/
				6972	if (rt > env->fbq_type)
				6973	continue;
				6974
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6975	capacity = capacity_of(i);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	6976
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	6977	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6978
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	6979	/*
				6980	* When comparing with imbalance, use weighted_cpuload()
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6981	* which is not scaled with the cpu capacity.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	6982	*/
Vincent Guittot	ea67821	2015-02-27 16:54:11 +0100	[diff] [blame]	6983
				6984	if (rq->nr_running == 1 && wl > env->imbalance &&
				6985	!check_cpu_capacity(rq, env->sd))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6986	continue;
				6987
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	6988	/*
				6989	* For the load comparisons with the other cpu's, consider
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6990	* the weighted_cpuload() scaled with the cpu capacity, so
				6991	* that the load can be moved away from the cpu that is
				6992	* potentially running at a lower capacity.
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	6993	*
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6994	* Thus we're looking for max(wl_i / capacity_i), crosswise
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	6995	* multiplication to rid ourselves of the division works out
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6996	* to: wl_i * capacity_j > wl_j * capacity_i; where j is
				6997	* our previous maximum.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	6998	*/
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	6999	if (wl * busiest_capacity > busiest_load * capacity) {
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	7000	busiest_load = wl;
Nicolas Pitre	ced549f	2014-05-26 18:19:38 -0400	[diff] [blame]	7001	busiest_capacity = capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7002	busiest = rq;
				7003	}
				7004	}
				7005
				7006	return busiest;
				7007	}
				7008
				7009	/*
				7010	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				7011	* so long as it is large enough.
				7012	*/
				7013	#define MAX_PINNED_INTERVAL 512
				7014
				7015	/* Working cpumask for load_balance and load_balance_newidle. */
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	7016	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7017
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7018	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	7019	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7020	struct sched_domain *sd = env->sd;
				7021
				7022	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7023
				7024	/*
				7025	* ASYM_PACKING needs to force migrate tasks from busy but
				7026	* higher numbered CPUs in order to pack all tasks in the
				7027	* lowest numbered CPUs.
				7028	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7029	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	7030	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	7031	}
				7032
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7033	/*
				7034	* The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
				7035	* It's worth migrating the task if the src_cpu's capacity is reduced
				7036	* because of other sched_class or IRQs if more capacity stays
				7037	* available on dst_cpu.
				7038	*/
				7039	if ((env->idle != CPU_NOT_IDLE) &&
				7040	(env->src_rq->cfs.h_nr_running == 1)) {
				7041	if ((check_cpu_capacity(env->src_rq, sd)) &&
				7042	(capacity_of(env->src_cpu)sd->imbalance_pct < capacity_of(env->dst_cpu)100))
				7043	return 1;
				7044	}
				7045
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	7046	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				7047	}
				7048
/*
 * Forward declaration: load_balance() hands this callback to
 * stop_one_cpu_nowait() before its definition appears in the file.
 */
static int active_load_balance_cpu_stop(void *data);
				7050
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7051	static int should_we_balance(struct lb_env *env)
				7052	{
				7053	struct sched_group *sg = env->sd->groups;
				7054	struct cpumask sg_cpus, sg_mask;
				7055	int cpu, balance_cpu = -1;
				7056
				7057	/*
				7058	* In the newly idle case, we will allow all the cpu's
				7059	* to do the newly idle load balance.
				7060	*/
				7061	if (env->idle == CPU_NEWLY_IDLE)
				7062	return 1;
				7063
				7064	sg_cpus = sched_group_cpus(sg);
				7065	sg_mask = sched_group_mask(sg);
				7066	/* Try to find first idle cpu */
				7067	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
				7068	if (!cpumask_test_cpu(cpu, sg_mask) \|\| !idle_cpu(cpu))
				7069	continue;
				7070
				7071	balance_cpu = cpu;
				7072	break;
				7073	}
				7074
				7075	if (balance_cpu == -1)
				7076	balance_cpu = group_balance_cpu(sg);
				7077
				7078	/*
				7079	* First idle cpu or the first cpu(busiest) in this sched group
				7080	* is eligible for doing load balancing at this and above domains.
				7081	*/
Joonsoo Kim	b0cff9d	2013-09-10 15:54:49 +0900	[diff] [blame]	7082	return balance_cpu == env->dst_cpu;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7083	}
				7084
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7085	/*
				7086	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				7087	* tasks if there is an imbalance.
				7088	*/
				7089	static int load_balance(int this_cpu, struct rq *this_rq,
				7090	struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7091	int *continue_balancing)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7092	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7093	int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7094	struct sched_domain *sd_parent = sd->parent;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7095	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7096	struct rq *busiest;
				7097	unsigned long flags;
Christoph Lameter	4ba2968	2014-08-26 19:12:21 -0500	[diff] [blame]	7098	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7099
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7100	struct lb_env env = {
				7101	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	7102	.dst_cpu = this_cpu,
				7103	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7104	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7105	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	7106	.loop_break = sched_nr_migrate_break,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	7107	.cpus = cpus,
Peter Zijlstra	0ec8aa0	2013-10-07 11:29:33 +0100	[diff] [blame]	7108	.fbq_type = all,
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	7109	.tasks = LIST_HEAD_INIT(env.tasks),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7110	};
				7111
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	7112	/*
				7113	* For NEWLY_IDLE load_balancing, we don't need to consider
				7114	* other cpus in our group
				7115	*/
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	7116	if (idle == CPU_NEWLY_IDLE)
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	7117	env.dst_grpmask = NULL;
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	7118
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7119	cpumask_copy(cpus, cpu_active_mask);
				7120
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7121	schedstat_inc(sd, lb_count[idle]);
				7122
				7123	redo:
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7124	if (!should_we_balance(&env)) {
				7125	*continue_balancing = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7126	goto out_balanced;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7127	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7128
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7129	group = find_busiest_group(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7130	if (!group) {
				7131	schedstat_inc(sd, lb_nobusyg[idle]);
				7132	goto out_balanced;
				7133	}
				7134
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	7135	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7136	if (!busiest) {
				7137	schedstat_inc(sd, lb_nobusyq[idle]);
				7138	goto out_balanced;
				7139	}
				7140
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	7141	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7142
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7143	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7144
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7145	env.src_cpu = busiest->cpu;
				7146	env.src_rq = busiest;
				7147
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7148	ld_moved = 0;
				7149	if (busiest->nr_running > 1) {
				7150	/*
				7151	* Attempt to move tasks. If find_busiest_group has found
				7152	* an imbalance but busiest->nr_running <= 1, the group is
				7153	* still unbalanced. ld_moved simply stays zero, so it is
				7154	* correctly treated as an imbalance.
				7155	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7156	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	7157	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7158
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	7159	more_balance:
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	7160	raw_spin_lock_irqsave(&busiest->lock, flags);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7161
				7162	/*
				7163	* cur_ld_moved - load moved in current iteration
				7164	* ld_moved - cumulative load moved across iterations
				7165	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	7166	cur_ld_moved = detach_tasks(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7167
				7168	/*
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	7169	* We've detached some tasks from busiest_rq. Every
				7170	* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
				7171	* unlock busiest->lock, and we are able to be sure
				7172	* that nobody can manipulate the tasks in parallel.
				7173	* See task_rq_lock() family for the details.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7174	*/
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	7175
				7176	raw_spin_unlock(&busiest->lock);
				7177
				7178	if (cur_ld_moved) {
				7179	attach_tasks(&env);
				7180	ld_moved += cur_ld_moved;
				7181	}
				7182
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7183	local_irq_restore(flags);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7184
Joonsoo Kim	f1cd085	2013-04-23 17:27:37 +0900	[diff] [blame]	7185	if (env.flags & LBF_NEED_BREAK) {
				7186	env.flags &= ~LBF_NEED_BREAK;
				7187	goto more_balance;
				7188	}
				7189
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7190	/*
				7191	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				7192	* us and move them to an alternate dst_cpu in our sched_group
				7193	* where they can run. The upper limit on how many times we
				7194	* iterate on same src_cpu is dependent on number of cpus in our
				7195	* sched_group.
				7196	*
				7197	* This changes load balance semantics a bit on who can move
				7198	* load to a given_cpu. In addition to the given_cpu itself
				7199	* (or a ilb_cpu acting on its behalf where given_cpu is
				7200	* nohz-idle), we now have balance_cpu in a position to move
				7201	* load to given_cpu. In rare situations, this may cause
				7202	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				7203	* _independently_ and at _same_ time to move some load to
				7204	* given_cpu) causing exceess load to be moved to given_cpu.
				7205	* This however should not happen so much in practice and
				7206	* moreover subsequent load balance cycles should correct the
				7207	* excess load moved.
				7208	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7209	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7210
Vladimir Davydov	7aff2e3	2013-09-15 21:30:13 +0400	[diff] [blame]	7211	/* Prevent to re-select dst_cpu via env's cpus */
				7212	cpumask_clear_cpu(env.dst_cpu, env.cpus);
				7213
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	7214	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7215	env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7216	env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7217	env.loop = 0;
				7218	env.loop_break = sched_nr_migrate_break;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	7219
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	7220	/*
				7221	* Go back to "more_balance" rather than "redo" since we
				7222	* need to continue with same src_cpu.
				7223	*/
				7224	goto more_balance;
				7225	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7226
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7227	/*
				7228	* We failed to reach balance because of affinity.
				7229	*/
				7230	if (sd_parent) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7231	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7232
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	7233	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7234	*group_imbalance = 1;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7235	}
				7236
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7237	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7238	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7239	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	7240	if (!cpumask_empty(cpus)) {
				7241	env.loop = 0;
				7242	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7243	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	7244	}
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	7245	goto out_all_pinned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7246	}
				7247	}
				7248
				7249	if (!ld_moved) {
				7250	schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	7251	/*
				7252	* Increment the failure counter only on periodic balance.
				7253	* We do not want newidle balance, which can be very
				7254	* frequent, pollute the failure counter causing
				7255	* excessive cache_hot migrations and active balances.
				7256	*/
				7257	if (idle != CPU_NEWLY_IDLE)
				7258	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7259
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7260	if (need_active_balance(&env)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7261	raw_spin_lock_irqsave(&busiest->lock, flags);
				7262
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7263	/* don't kick the active_load_balance_cpu_stop,
				7264	* if the curr task on busiest cpu can't be
				7265	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7266	*/
				7267	if (!cpumask_test_cpu(this_cpu,
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	7268	tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7269	raw_spin_unlock_irqrestore(&busiest->lock,
				7270	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7271	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7272	goto out_one_pinned;
				7273	}
				7274
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7275	/*
				7276	* ->active_balance synchronizes accesses to
				7277	* ->active_balance_work. Once set, it's cleared
				7278	* only after active load balance is finished.
				7279	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7280	if (!busiest->active_balance) {
				7281	busiest->active_balance = 1;
				7282	busiest->push_cpu = this_cpu;
				7283	active_balance = 1;
				7284	}
				7285	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7286
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7287	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7288	stop_one_cpu_nowait(cpu_of(busiest),
				7289	active_load_balance_cpu_stop, busiest,
				7290	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	7291	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7292
				7293	/*
				7294	* We've kicked active balancing, reset the failure
				7295	* counter.
				7296	*/
				7297	sd->nr_balance_failed = sd->cache_nice_tries+1;
				7298	}
				7299	} else
				7300	sd->nr_balance_failed = 0;
				7301
				7302	if (likely(!active_balance)) {
				7303	/* We were unbalanced, so reset the balancing interval */
				7304	sd->balance_interval = sd->min_interval;
				7305	} else {
				7306	/*
				7307	* If we've begun active balancing, start to back off. This
				7308	* case may not be covered by the all_pinned logic if there
				7309	* is only 1 task on the busy runqueue (because we don't call
Kirill Tkhai	163122b	2014-08-20 13:48:29 +0400	[diff] [blame]	7310	* detach_tasks).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7311	*/
				7312	if (sd->balance_interval < sd->max_interval)
				7313	sd->balance_interval *= 2;
				7314	}
				7315
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7316	goto out;
				7317
				7318	out_balanced:
Vincent Guittot	afdeee0	2014-08-26 13:06:44 +0200	[diff] [blame]	7319	/*
				7320	* We reach balance although we may have faced some affinity
				7321	* constraints. Clear the imbalance flag if it was set.
				7322	*/
				7323	if (sd_parent) {
				7324	int *group_imbalance = &sd_parent->groups->sgc->imbalance;
				7325
				7326	if (*group_imbalance)
				7327	*group_imbalance = 0;
				7328	}
				7329
				7330	out_all_pinned:
				7331	/*
				7332	* We reach balance because all tasks are pinned at this level so
				7333	* we can't migrate them. Let the imbalance flag set so parent level
				7334	* can try to migrate them.
				7335	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7336	schedstat_inc(sd, lb_balanced[idle]);
				7337
				7338	sd->nr_balance_failed = 0;
				7339
				7340	out_one_pinned:
				7341	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7342	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	7343	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7344	(sd->balance_interval < sd->max_interval))
				7345	sd->balance_interval *= 2;
				7346
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	7347	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7348	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7349	return ld_moved;
				7350	}
				7351
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7352	static inline unsigned long
				7353	get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
				7354	{
				7355	unsigned long interval = sd->balance_interval;
				7356
				7357	if (cpu_busy)
				7358	interval *= sd->busy_factor;
				7359
				7360	/* scale ms to jiffies */
				7361	interval = msecs_to_jiffies(interval);
				7362	interval = clamp(interval, 1UL, max_load_balance_interval);
				7363
				7364	return interval;
				7365	}
				7366
				7367	static inline void
				7368	update_next_balance(struct sched_domain sd, int cpu_busy, unsigned long next_balance)
				7369	{
				7370	unsigned long interval, next;
				7371
				7372	interval = get_sd_balance_interval(sd, cpu_busy);
				7373	next = sd->last_balance + interval;
				7374
				7375	if (time_after(*next_balance, next))
				7376	*next_balance = next;
				7377	}
				7378
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7379	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7380	* idle_balance is called by schedule() if this_cpu is about to become
				7381	* idle. Attempts to pull tasks from other CPUs.
				7382	*/
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	7383	static int idle_balance(struct rq *this_rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7384	{
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7385	unsigned long next_balance = jiffies + HZ;
				7386	int this_cpu = this_rq->cpu;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7387	struct sched_domain *sd;
				7388	int pulled_task = 0;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	7389	u64 curr_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7390
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	7391	/*
				7392	* We must set idle_stamp _before_ calling idle_balance(), such that we
				7393	* measure the duration of idle_balance() as idle time.
				7394	*/
				7395	this_rq->idle_stamp = rq_clock(this_rq);
				7396
Tim Chen	4486edd	2014-06-23 12:16:49 -0700	[diff] [blame]	7397	if (this_rq->avg_idle < sysctl_sched_migration_cost \|\|
				7398	!this_rq->rd->overload) {
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7399	rcu_read_lock();
				7400	sd = rcu_dereference_check_sched_domain(this_rq->sd);
				7401	if (sd)
				7402	update_next_balance(sd, 0, &next_balance);
				7403	rcu_read_unlock();
				7404
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	7405	goto out;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7406	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7407
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	7408	raw_spin_unlock(&this_rq->lock);
				7409
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	7410	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	7411	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7412	for_each_domain(this_cpu, sd) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7413	int continue_balancing = 1;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	7414	u64 t0, domain_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7415
				7416	if (!(sd->flags & SD_LOAD_BALANCE))
				7417	continue;
				7418
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7419	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
				7420	update_next_balance(sd, 0, &next_balance);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	7421	break;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7422	}
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	7423
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	7424	if (sd->flags & SD_BALANCE_NEWIDLE) {
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	7425	t0 = sched_clock_cpu(this_cpu);
				7426
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	7427	pulled_task = load_balance(this_cpu, this_rq,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7428	sd, CPU_NEWLY_IDLE,
				7429	&continue_balancing);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	7430
				7431	domain_cost = sched_clock_cpu(this_cpu) - t0;
				7432	if (domain_cost > sd->max_newidle_lb_cost)
				7433	sd->max_newidle_lb_cost = domain_cost;
				7434
				7435	curr_cost += domain_cost;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	7436	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7437
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7438	update_next_balance(sd, 0, &next_balance);
Jason Low	39a4d9c	2014-04-23 18:30:35 -0700	[diff] [blame]	7439
				7440	/*
				7441	* Stop searching for tasks to pull if there are
				7442	* now runnable tasks on this rq.
				7443	*/
				7444	if (pulled_task \|\| this_rq->nr_running > 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7445	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7446	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	7447	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	7448
				7449	raw_spin_lock(&this_rq->lock);
				7450
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	7451	if (curr_cost > this_rq->max_idle_balance_cost)
				7452	this_rq->max_idle_balance_cost = curr_cost;
				7453
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	7454	/*
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	7455	* While browsing the domains, we released the rq lock, a task could
				7456	* have been enqueued in the meantime. Since we're not going idle,
				7457	* pretend we pulled a task.
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	7458	*/
Jason Low	0e5b533	2014-04-28 15:45:54 -0700	[diff] [blame]	7459	if (this_rq->cfs.h_nr_running && !pulled_task)
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	7460	pulled_task = 1;
Daniel Lezcano	e5fc661	2014-01-17 10:04:02 +0100	[diff] [blame]	7461
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	7462	out:
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7463	/* Move the next balance forward */
				7464	if (time_after(this_rq->next_balance, next_balance))
				7465	this_rq->next_balance = next_balance;
				7466
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	7467	/* Is there a task of a high priority class? */
Kirill Tkhai	4638364	2014-03-15 02:15:07 +0400	[diff] [blame]	7468	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
Kirill Tkhai	e4aa358	2014-03-06 13:31:55 +0400	[diff] [blame]	7469	pulled_task = -1;
				7470
Dietmar Eggemann	38c6ade	2015-10-20 13:04:41 +0100	[diff] [blame]	7471	if (pulled_task)
Peter Zijlstra	6e83125	2014-02-11 16:11:48 +0100	[diff] [blame]	7472	this_rq->idle_stamp = 0;
				7473
Daniel Lezcano	3c4017c	2014-01-17 10:04:03 +0100	[diff] [blame]	7474	return pulled_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7475	}
				7476
				7477	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7478	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				7479	* running tasks off the busiest CPU onto idle CPUs. It requires at
				7480	* least 1 task to be running on each physical CPU where possible, and
				7481	* avoids physical / logical imbalances.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7482	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7483	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7484	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7485	struct rq *busiest_rq = data;
				7486	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7487	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7488	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7489	struct sched_domain *sd;
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	7490	struct task_struct *p = NULL;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7491
				7492	raw_spin_lock_irq(&busiest_rq->lock);
				7493
				7494	/* make sure the requested cpu hasn't gone down in the meantime */
				7495	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				7496	!busiest_rq->active_balance))
				7497	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7498
				7499	/* Is there any task to move? */
				7500	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7501	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7502
				7503	/*
				7504	* This condition is "impossible", if it occurs
				7505	* we need to fix it. Originally reported by
				7506	* Bjorn Helgaas on a 128-cpu setup.
				7507	*/
				7508	BUG_ON(busiest_rq == target_rq);
				7509
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7510	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	7511	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7512	for_each_domain(target_cpu, sd) {
				7513	if ((sd->flags & SD_LOAD_BALANCE) &&
				7514	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				7515	break;
				7516	}
				7517
				7518	if (likely(sd)) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7519	struct lb_env env = {
				7520	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	7521	.dst_cpu = target_cpu,
				7522	.dst_rq = target_rq,
				7523	.src_cpu = busiest_rq->cpu,
				7524	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	7525	.idle = CPU_IDLE,
				7526	};
				7527
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7528	schedstat_inc(sd, alb_count);
				7529
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	7530	p = detach_one_task(&env);
				7531	if (p)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7532	schedstat_inc(sd, alb_pushed);
				7533	else
				7534	schedstat_inc(sd, alb_failed);
				7535	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	7536	rcu_read_unlock();
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7537	out_unlock:
				7538	busiest_rq->active_balance = 0;
Kirill Tkhai	e5673f2	2014-08-20 13:48:01 +0400	[diff] [blame]	7539	raw_spin_unlock(&busiest_rq->lock);
				7540
				7541	if (p)
				7542	attach_one_task(target_rq, p);
				7543
				7544	local_irq_enable();
				7545
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	7546	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7547	}
				7548
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	7549	static inline int on_null_domain(struct rq *rq)
				7550	{
				7551	return unlikely(!rcu_dereference_sched(rq->sd));
				7552	}
				7553
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	7554	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7555	/*
				7556	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7557	* - When one of the busy CPUs notice that there may be an idle rebalancing
				7558	* needed, they will kick the idle load balancer, which then does idle
				7559	* load balancing for all the idle CPUs.
				7560	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7561	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7562	cpumask_var_t idle_cpus_mask;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7563	atomic_t nr_cpus;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7564	unsigned long next_balance; /* in jiffy units */
				7565	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7566
Daniel Lezcano	3dd0337	2014-01-06 12:34:41 +0100	[diff] [blame]	7567	static inline int find_new_ilb(void)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7568	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7569	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7570
Suresh Siddha	786d6dc	2011-12-01 17:07:35 -0800	[diff] [blame]	7571	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				7572	return ilb;
				7573
				7574	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7575	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7576
				7577	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7578	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				7579	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				7580	* CPU (if there is one).
				7581	*/
Daniel Lezcano	0aeeeeb	2014-01-06 12:34:42 +0100	[diff] [blame]	7582	static void nohz_balancer_kick(void)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7583	{
				7584	int ilb_cpu;
				7585
				7586	nohz.next_balance++;
				7587
Daniel Lezcano	3dd0337	2014-01-06 12:34:41 +0100	[diff] [blame]	7588	ilb_cpu = find_new_ilb();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7589
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7590	if (ilb_cpu >= nr_cpu_ids)
				7591	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7592
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	7593	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	7594	return;
				7595	/*
				7596	* Use smp_send_reschedule() instead of resched_cpu().
				7597	* This way we generate a sched IPI on the target cpu which
				7598	* is idle. And the softirq performing nohz idle load balance
				7599	* will be run before returning from the IPI.
				7600	*/
				7601	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7602	return;
				7603	}
				7604
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	7605	static inline void nohz_balance_exit_idle(int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	7606	{
				7607	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	7608	/*
				7609	* Completely isolated CPUs don't ever set, so we must test.
				7610	*/
				7611	if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
				7612	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				7613	atomic_dec(&nohz.nr_cpus);
				7614	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	7615	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				7616	}
				7617	}
				7618
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	7619	static inline void set_cpu_sd_state_busy(void)
				7620	{
				7621	struct sched_domain *sd;
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7622	int cpu = smp_processor_id();
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	7623
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	7624	rcu_read_lock();
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7625	sd = rcu_dereference(per_cpu(sd_busy, cpu));
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	7626
				7627	if (!sd \|\| !sd->nohz_idle)
				7628	goto unlock;
				7629	sd->nohz_idle = 0;
				7630
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7631	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	7632	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	7633	rcu_read_unlock();
				7634	}
				7635
				7636	void set_cpu_sd_state_idle(void)
				7637	{
				7638	struct sched_domain *sd;
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7639	int cpu = smp_processor_id();
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	7640
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	7641	rcu_read_lock();
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7642	sd = rcu_dereference(per_cpu(sd_busy, cpu));
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	7643
				7644	if (!sd \|\| sd->nohz_idle)
				7645	goto unlock;
				7646	sd->nohz_idle = 1;
				7647
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7648	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	7649	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	7650	rcu_read_unlock();
				7651	}
				7652
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7653	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	7654	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7655	* This info will be used in performing idle load balancing in the future.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7656	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	7657	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7658	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	7659	/*
				7660	* If this cpu is going down, then nothing needs to be done.
				7661	*/
				7662	if (!cpu_active(cpu))
				7663	return;
				7664
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	7665	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				7666	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7667
Mike Galbraith	d987fc7	2011-12-05 10:01:47 +0100	[diff] [blame]	7668	/*
				7669	* If we're a completely isolated CPU, we don't play.
				7670	*/
				7671	if (on_null_domain(cpu_rq(cpu)))
				7672	return;
				7673
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	7674	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				7675	atomic_inc(&nohz.nr_cpus);
				7676	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7677	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	7678
Paul Gortmaker	0db0628	2013-06-19 14:53:51 -0400	[diff] [blame]	7679	static int sched_ilb_notifier(struct notifier_block *nfb,
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	7680	unsigned long action, void *hcpu)
				7681	{
				7682	switch (action & ~CPU_TASKS_FROZEN) {
				7683	case CPU_DYING:
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	7684	nohz_balance_exit_idle(smp_processor_id());
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	7685	return NOTIFY_OK;
				7686	default:
				7687	return NOTIFY_DONE;
				7688	}
				7689	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7690	#endif
				7691
				7692	static DEFINE_SPINLOCK(balancing);
				7693
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	7694	/*
				7695	* Scale the max load_balance interval with the number of CPUs in the system.
				7696	* This trades load-balance latency on larger machines for less cross talk.
				7697	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	7698	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	7699	{
				7700	max_load_balance_interval = HZ*num_online_cpus()/10;
				7701	}
				7702
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7703	/*
				7704	* It checks each scheduling domain to see if it is due to be balanced,
				7705	* and initiates a balancing operation if so.
				7706	*
Libin	b9b0853	2013-04-01 19:14:01 +0800	[diff] [blame]	7707	* Balancing parameters are set up in init_sched_domains.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7708	*/
Daniel Lezcano	f7ed0a8	2014-01-06 12:34:43 +0100	[diff] [blame]	7709	static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7710	{
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7711	int continue_balancing = 1;
Daniel Lezcano	f7ed0a8	2014-01-06 12:34:43 +0100	[diff] [blame]	7712	int cpu = rq->cpu;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7713	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	7714	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7715	/* Earliest time when we have to do rebalance again */
				7716	unsigned long next_balance = jiffies + 60*HZ;
				7717	int update_next_balance = 0;
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	7718	int need_serialize, need_decay = 0;
				7719	u64 max_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7720
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	7721	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	7722
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	7723	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7724	for_each_domain(cpu, sd) {
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	7725	/*
				7726	* Decay the newidle max times here because this is a regular
				7727	* visit to all the domains. Decay ~1% per second.
				7728	*/
				7729	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
				7730	sd->max_newidle_lb_cost =
				7731	(sd->max_newidle_lb_cost * 253) / 256;
				7732	sd->next_decay_max_lb_cost = jiffies + HZ;
				7733	need_decay = 1;
				7734	}
				7735	max_cost += sd->max_newidle_lb_cost;
				7736
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7737	if (!(sd->flags & SD_LOAD_BALANCE))
				7738	continue;
				7739
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	7740	/*
				7741	* Stop the load balance at this level. There is another
				7742	* CPU in our sched group which is doing load balancing more
				7743	* actively.
				7744	*/
				7745	if (!continue_balancing) {
				7746	if (need_decay)
				7747	continue;
				7748	break;
				7749	}
				7750
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7751	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7752
				7753	need_serialize = sd->flags & SD_SERIALIZE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7754	if (need_serialize) {
				7755	if (!spin_trylock(&balancing))
				7756	goto out;
				7757	}
				7758
				7759	if (time_after_eq(jiffies, sd->last_balance + interval)) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	7760	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7761	/*
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	7762	* The LBF_DST_PINNED logic could have changed
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	7763	* env->dst_cpu, so we can't know our idle
				7764	* state even if we migrated tasks. Update it.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7765	*/
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	7766	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7767	}
				7768	sd->last_balance = jiffies;
Jason Low	52a08ef	2014-05-08 17:49:22 -0700	[diff] [blame]	7769	interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7770	}
				7771	if (need_serialize)
				7772	spin_unlock(&balancing);
				7773	out:
				7774	if (time_after(next_balance, sd->last_balance + interval)) {
				7775	next_balance = sd->last_balance + interval;
				7776	update_next_balance = 1;
				7777	}
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	7778	}
				7779	if (need_decay) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7780	/*
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	7781	* Ensure the rq-wide value also decays but keep it at a
				7782	* reasonable floor to avoid funnies with rq->avg_idle.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7783	*/
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	7784	rq->max_idle_balance_cost =
				7785	max((u64)sysctl_sched_migration_cost, max_cost);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7786	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	7787	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7788
				7789	/*
				7790	* next_balance will be updated only when there is a need.
				7791	* When the cpu is attached to null domain for ex, it will not be
				7792	* updated.
				7793	*/
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	7794	if (likely(update_next_balance)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7795	rq->next_balance = next_balance;
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	7796
				7797	#ifdef CONFIG_NO_HZ_COMMON
				7798	/*
				7799	* If this CPU has been elected to perform the nohz idle
				7800	* balance. Other idle CPUs have already rebalanced with
				7801	* nohz_idle_balance() and nohz.next_balance has been
				7802	* updated accordingly. This CPU is now running the idle load
				7803	* balance for itself and we need to update the
				7804	* nohz.next_balance accordingly.
				7805	*/
				7806	if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
				7807	nohz.next_balance = rq->next_balance;
				7808	#endif
				7809	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7810	}
				7811
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	7812	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7813	/*
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	7814	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7815	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				7816	*/
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	7817	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7818	{
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	7819	int this_cpu = this_rq->cpu;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7820	struct rq *rq;
				7821	int balance_cpu;
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	7822	/* Earliest time when we have to do rebalance again */
				7823	unsigned long next_balance = jiffies + 60*HZ;
				7824	int update_next_balance = 0;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7825
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	7826	if (idle != CPU_IDLE \|\|
				7827	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				7828	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7829
				7830	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	7831	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7832	continue;
				7833
				7834	/*
				7835	* If this cpu gets work to do, stop the load balancing
				7836	* work being done for other cpus. Next load
				7837	* balancing owner will pick it up.
				7838	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	7839	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7840	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7841
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	7842	rq = cpu_rq(balance_cpu);
				7843
Tim Chen	ed61bbc	2014-05-20 14:39:27 -0700	[diff] [blame]	7844	/*
				7845	* If time for next balance is due,
				7846	* do the balance.
				7847	*/
				7848	if (time_after_eq(jiffies, rq->next_balance)) {
				7849	raw_spin_lock_irq(&rq->lock);
				7850	update_rq_clock(rq);
				7851	update_idle_cpu_load(rq);
				7852	raw_spin_unlock_irq(&rq->lock);
				7853	rebalance_domains(rq, CPU_IDLE);
				7854	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7855
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	7856	if (time_after(next_balance, rq->next_balance)) {
				7857	next_balance = rq->next_balance;
				7858	update_next_balance = 1;
				7859	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7860	}
Vincent Guittot	c5afb6a	2015-08-03 11:55:50 +0200	[diff] [blame]	7861
				7862	/*
				7863	* next_balance will be updated only when there is a need.
				7864	* When the CPU is attached to null domain for ex, it will not be
				7865	* updated.
				7866	*/
				7867	if (likely(update_next_balance))
				7868	nohz.next_balance = next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	7869	end:
				7870	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7871	}
				7872
				7873	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7874	* Current heuristic for kicking the idle load balancer in the presence
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7875	* of an idle cpu in the system.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7876	* - This rq has more than one task.
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7877	* - This rq has at least one CFS task and the capacity of the CPU is
				7878	* significantly reduced because of RT tasks or IRQs.
				7879	* - At parent of LLC scheduler domain level, this cpu's scheduler group has
				7880	* multiple busy cpu.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7881	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				7882	* domain span are idle.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7883	*/
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7884	static inline bool nohz_kick_needed(struct rq *rq)
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7885	{
				7886	unsigned long now = jiffies;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7887	struct sched_domain *sd;
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7888	struct sched_group_capacity *sgc;
Daniel Lezcano	4a72562	2014-01-06 12:34:39 +0100	[diff] [blame]	7889	int nr_busy, cpu = rq->cpu;
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7890	bool kick = false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7891
Daniel Lezcano	4a72562	2014-01-06 12:34:39 +0100	[diff] [blame]	7892	if (unlikely(rq->idle_balance))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7893	return false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7894
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	7895	/*
				7896	* We may be recently in ticked or tickless idle mode. At the first
				7897	* busy tick after returning from idle, we will update the busy stats.
				7898	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	7899	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	7900	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7901
				7902	/*
				7903	* None are in tickless mode and hence no need for NOHZ idle load
				7904	* balancing.
				7905	*/
				7906	if (likely(!atomic_read(&nohz.nr_cpus)))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7907	return false;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	7908
				7909	if (time_before(now, nohz.next_balance))
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7910	return false;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7911
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	7912	if (rq->nr_running >= 2)
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7913	return true;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7914
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	7915	rcu_read_lock();
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7916	sd = rcu_dereference(per_cpu(sd_busy, cpu));
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7917	if (sd) {
Nicolas Pitre	63b2ca3	2014-05-26 18:19:37 -0400	[diff] [blame]	7918	sgc = sd->groups->sgc;
				7919	nr_busy = atomic_read(&sgc->nr_busy_cpus);
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7920
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7921	if (nr_busy > 1) {
				7922	kick = true;
				7923	goto unlock;
				7924	}
				7925
				7926	}
				7927
				7928	sd = rcu_dereference(rq->sd);
				7929	if (sd) {
				7930	if ((rq->cfs.h_nr_running >= 1) &&
				7931	check_cpu_capacity(rq, sd)) {
				7932	kick = true;
				7933	goto unlock;
				7934	}
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7935	}
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7936
				7937	sd = rcu_dereference(per_cpu(sd_asym, cpu));
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7938	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7939	sched_domain_span(sd)) < cpu)) {
				7940	kick = true;
				7941	goto unlock;
				7942	}
Preeti U Murthy	37dc6b5	2013-10-30 08:42:52 +0530	[diff] [blame]	7943
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7944	unlock:
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	7945	rcu_read_unlock();
Vincent Guittot	1aaf90a	2015-02-27 16:54:14 +0100	[diff] [blame]	7946	return kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7947	}
				7948	#else
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	7949	static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7950	#endif
				7951
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7952	/*
				7953	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7954	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7955	*/
				7956	static void run_rebalance_domains(struct softirq_action *h)
				7957	{
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	7958	struct rq *this_rq = this_rq();
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	7959	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7960	CPU_IDLE : CPU_NOT_IDLE;
				7961
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7962	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7963	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7964	* balancing on behalf of the other idle cpus whose ticks are
Preeti U Murthy	d4573c3	2015-03-26 18:32:44 +0530	[diff] [blame]	7965	* stopped. Do nohz_idle_balance before rebalance_domains to
				7966	* give the idle cpus a chance to load balance. Else we may
				7967	* load balance only within the local sched_domain hierarchy
				7968	* and abort nohz_idle_balance altogether if we pull some load.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7969	*/
Daniel Lezcano	208cb16	2014-01-06 12:34:44 +0100	[diff] [blame]	7970	nohz_idle_balance(this_rq, idle);
Preeti U Murthy	d4573c3	2015-03-26 18:32:44 +0530	[diff] [blame]	7971	rebalance_domains(this_rq, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7972	}
				7973
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7974	/*
				7975	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7976	*/
Daniel Lezcano	7caff66	2014-01-06 12:34:38 +0100	[diff] [blame]	7977	void trigger_load_balance(struct rq *rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7978	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7979	/* Don't need to rebalance while attached to NULL domain */
Daniel Lezcano	c726099	2014-01-06 12:34:45 +0100	[diff] [blame]	7980	if (unlikely(on_null_domain(rq)))
				7981	return;
				7982
				7983	if (time_after_eq(jiffies, rq->next_balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7984	raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	7985	#ifdef CONFIG_NO_HZ_COMMON
Daniel Lezcano	c726099	2014-01-06 12:34:45 +0100	[diff] [blame]	7986	if (nohz_kick_needed(rq))
Daniel Lezcano	0aeeeeb	2014-01-06 12:34:42 +0100	[diff] [blame]	7987	nohz_balancer_kick();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	7988	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	7989	}
				7990
/*
 * sched_class::rq_online hook for the fair class: refresh the sysctl-derived
 * tunables, then update the runtime-enabled state of @rq.
 */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
	update_runtime_enabled(rq);
}
				7997
				7998	static void rq_offline_fair(struct rq *rq)
				7999	{
				8000	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	8001
				8002	/* Ensure any throttled groups are reachable by pick_next_task */
				8003	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	8004	}
				8005
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	8006	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	8007
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8008	/*
				8009	* scheduler tick hitting a task of our scheduling class:
				8010	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	8011	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8012	{
				8013	struct cfs_rq *cfs_rq;
				8014	struct sched_entity *se = &curr->se;
				8015
				8016	for_each_sched_entity(se) {
				8017	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	8018	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8019	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	8020
Srikar Dronamraju	b52da86	2015-10-02 07:48:25 +0530	[diff] [blame]	8021	if (static_branch_unlikely(&sched_numa_balancing))
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	8022	task_tick_numa(rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8023	}
				8024
				8025	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8026	* called on fork with the child task as argument from the parent's context
				8027	* - child not yet on the tasklist
				8028	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8029	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8030	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8031	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	8032	struct cfs_rq *cfs_rq;
				8033	struct sched_entity se = &p->se, curr;
Ingo Molnar	00bf7bf	2007-10-15 17:00:14 +0200	[diff] [blame]	8034	int this_cpu = smp_processor_id();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8035	struct rq *rq = this_rq();
				8036	unsigned long flags;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8037
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	8038	raw_spin_lock_irqsave(&rq->lock, flags);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8039
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	8040	update_rq_clock(rq);
				8041
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	8042	cfs_rq = task_cfs_rq(current);
				8043	curr = cfs_rq->curr;
				8044
Daisuke Nishimura	6c9a27f	2013-09-10 18:16:36 +0900	[diff] [blame]	8045	/*
				8046	* Not only the cpu but also the task_group of the parent might have
				8047	* been changed after parent->se.parent,cfs_rq were copied to
				8048	* child->se.parent,cfs_rq. So call __set_task_cpu() to make those
				8049	* of child point to valid ones.
				8050	*/
				8051	rcu_read_lock();
				8052	__set_task_cpu(p, this_cpu);
				8053	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8054
Ting Yang	7109c442	2007-08-28 12:53:24 +0200	[diff] [blame]	8055	update_curr(cfs_rq);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8056
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	8057	if (curr)
				8058	se->vruntime = curr->vruntime;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	8059	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	8060
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8061	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	8062	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	8063	* Upon rescheduling, sched_class::put_prev_task() will place
				8064	* 'current' within the tree based on its new key value.
				8065	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	8066	swap(curr->vruntime, se->vruntime);
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	8067	resched_curr(rq);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	8068	}
				8069
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	8070	se->vruntime -= cfs_rq->min_vruntime;
				8071
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	8072	raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8073	}
				8074
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8075	/*
				8076	* Priority of the task has changed. Check to see if we preempt
				8077	* the current task.
				8078	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8079	static void
				8080	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8081	{
Kirill Tkhai	da0c1e6	2014-08-20 13:47:32 +0400	[diff] [blame]	8082	if (!task_on_rq_queued(p))
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8083	return;
				8084
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8085	/*
				8086	* Reschedule if we are currently running on this runqueue and
				8087	* our priority decreased, or if we are not currently running on
				8088	* this runqueue and our priority is higher than the current's
				8089	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8090	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8091	if (p->prio > oldprio)
Kirill Tkhai	8875125	2014-06-29 00:03:57 +0400	[diff] [blame]	8092	resched_curr(rq);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8093	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	8094	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8095	}
				8096
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8097	static inline bool vruntime_normalized(struct task_struct *p)
				8098	{
				8099	struct sched_entity *se = &p->se;
				8100
				8101	/*
				8102	* In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
				8103	* the dequeue_entity(.flags=0) will already have normalized the
				8104	* vruntime.
				8105	*/
				8106	if (p->on_rq)
				8107	return true;
				8108
				8109	/*
				8110	* When !on_rq, vruntime of the task has usually NOT been normalized.
				8111	* But there are some cases where it has already been normalized:
				8112	*
				8113	* - A forked child which is waiting for being woken up by
				8114	* wake_up_new_task().
				8115	* - A task which has been woken up by try_to_wake_up() and
				8116	* waiting for actually being woken up by sched_ttwu_pending().
				8117	*/
				8118	if (!se->sum_exec_runtime \|\| p->state == TASK_WAKING)
				8119	return true;
				8120
				8121	return false;
				8122	}
				8123
				8124	static void detach_task_cfs_rq(struct task_struct *p)
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8125	{
				8126	struct sched_entity *se = &p->se;
				8127	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				8128
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8129	if (!vruntime_normalized(p)) {
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8130	/*
				8131	* Fix up our vruntime so that the current sleep doesn't
				8132	* cause 'unlimited' sleep bonus.
				8133	*/
				8134	place_entity(cfs_rq, se, 0);
				8135	se->vruntime -= cfs_rq->min_vruntime;
				8136	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	8137
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	8138	/* Catch up with the cfs_rq and remove our load when we leave */
Byungchul Park	a05e8c5	2015-08-20 20:21:56 +0900	[diff] [blame]	8139	detach_entity_load_avg(cfs_rq, se);
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8140	}
				8141
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8142	static void attach_task_cfs_rq(struct task_struct *p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8143	{
Kirill Tkhai	f36c019	2014-08-06 12:06:01 +0400	[diff] [blame]	8144	struct sched_entity *se = &p->se;
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8145	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	8146
				8147	#ifdef CONFIG_FAIR_GROUP_SCHED
Michael wang	eb7a59b	2014-02-20 11:14:53 +0800	[diff] [blame]	8148	/*
				8149	* Since the real-depth could have been changed (only FAIR
				8150	* class maintain depth value), reset depth properly.
				8151	*/
				8152	se->depth = se->parent ? se->parent->depth + 1 : 0;
				8153	#endif
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	8154
Byungchul Park	6efdb10	2015-08-20 20:21:59 +0900	[diff] [blame]	8155	/* Synchronize task with its cfs_rq */
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8156	attach_entity_load_avg(cfs_rq, se);
Byungchul Park	6efdb10	2015-08-20 20:21:59 +0900	[diff] [blame]	8157
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8158	if (!vruntime_normalized(p))
				8159	se->vruntime += cfs_rq->min_vruntime;
				8160	}
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	8161
/*
 * sched_class::switched_from hook: @p is leaving the fair class, so detach
 * it from its cfs_rq.  @rq is not used here.
 */
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
	detach_task_cfs_rq(p);
}
				8166
				8167	static void switched_to_fair(struct rq rq, struct task_struct p)
				8168	{
				8169	attach_task_cfs_rq(p);
				8170
				8171	if (task_on_rq_queued(p)) {
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	8172	/*
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8173	* We were most likely switched from sched_rt, so
				8174	* kick off the schedule if running, otherwise just see
				8175	* if we can still preempt the current task.
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	8176	*/
Byungchul Park	daa5940	2015-08-20 20:22:00 +0900	[diff] [blame]	8177	if (rq->curr == p)
				8178	resched_curr(rq);
				8179	else
				8180	check_preempt_curr(rq, p, 0);
Byungchul Park	7855a35	2015-08-10 18:02:55 +0900	[diff] [blame]	8181	}
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8182	}
				8183
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	8184	/* Account for a task changing its policy or group.
				8185	*
				8186	* This routine is mostly called to set cfs_rq->curr field when a task
				8187	* migrates between groups/classes.
				8188	*/
				8189	static void set_curr_task_fair(struct rq *rq)
				8190	{
				8191	struct sched_entity *se = &rq->curr->se;
				8192
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	8193	for_each_sched_entity(se) {
				8194	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				8195
				8196	set_next_entity(cfs_rq, se);
				8197	/* ensure bandwidth has been allocated on our new cfs_rq */
				8198	account_cfs_rq_runtime(cfs_rq, 0);
				8199	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	8200	}
				8201
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8202	void init_cfs_rq(struct cfs_rq *cfs_rq)
				8203	{
				8204	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8205	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				8206	#ifndef CONFIG_64BIT
				8207	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				8208	#endif
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	8209	#ifdef CONFIG_SMP
Yuyang Du	9d89c25	2015-07-15 08:04:37 +0800	[diff] [blame]	8210	atomic_long_set(&cfs_rq->removed_load_avg, 0);
				8211	atomic_long_set(&cfs_rq->removed_util_avg, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	8212	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8213	}
				8214
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	8215	#ifdef CONFIG_FAIR_GROUP_SCHED
/*
 * sched_class::task_move_group hook: detach @p from its old cfs_rq, point
 * it at the runqueue structures for its current CPU, and attach it again.
 */
static void task_move_group_fair(struct task_struct *p)
{
	detach_task_cfs_rq(p);

	set_task_rq(p, task_cpu(p));

#ifdef CONFIG_SMP
	/* Tell se's cfs_rq has been changed -- migrated. */
	p->se.avg.last_update_time = 0;
#endif

	attach_task_cfs_rq(p);
}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8227
				8228	void free_fair_sched_group(struct task_group *tg)
				8229	{
				8230	int i;
				8231
				8232	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				8233
				8234	for_each_possible_cpu(i) {
				8235	if (tg->cfs_rq)
				8236	kfree(tg->cfs_rq[i]);
Yuyang Du	1269557	2015-07-15 08:04:40 +0800	[diff] [blame]	8237	if (tg->se) {
				8238	if (tg->se[i])
				8239	remove_entity_load_avg(tg->se[i]);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8240	kfree(tg->se[i]);
Yuyang Du	1269557	2015-07-15 08:04:40 +0800	[diff] [blame]	8241	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8242	}
				8243
				8244	kfree(tg->cfs_rq);
				8245	kfree(tg->se);
				8246	}
				8247
				8248	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				8249	{
				8250	struct cfs_rq *cfs_rq;
				8251	struct sched_entity *se;
				8252	int i;
				8253
				8254	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				8255	if (!tg->cfs_rq)
				8256	goto err;
				8257	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				8258	if (!tg->se)
				8259	goto err;
				8260
				8261	tg->shares = NICE_0_LOAD;
				8262
				8263	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				8264
				8265	for_each_possible_cpu(i) {
				8266	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				8267	GFP_KERNEL, cpu_to_node(i));
				8268	if (!cfs_rq)
				8269	goto err;
				8270
				8271	se = kzalloc_node(sizeof(struct sched_entity),
				8272	GFP_KERNEL, cpu_to_node(i));
				8273	if (!se)
				8274	goto err_free_rq;
				8275
				8276	init_cfs_rq(cfs_rq);
				8277	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
Yuyang Du	540247f	2015-07-15 08:04:39 +0800	[diff] [blame]	8278	init_entity_runnable_average(se);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8279	}
				8280
				8281	return 1;
				8282
				8283	err_free_rq:
				8284	kfree(cfs_rq);
				8285	err:
				8286	return 0;
				8287	}
				8288
				8289	void unregister_fair_sched_group(struct task_group *tg, int cpu)
				8290	{
				8291	struct rq *rq = cpu_rq(cpu);
				8292	unsigned long flags;
				8293
				8294	/*
				8295	* Only empty task groups can be destroyed; so we can speculatively
				8296	* check on_list without danger of it being re-added.
				8297	*/
				8298	if (!tg->cfs_rq[cpu]->on_list)
				8299	return;
				8300
				8301	raw_spin_lock_irqsave(&rq->lock, flags);
				8302	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				8303	raw_spin_unlock_irqrestore(&rq->lock, flags);
				8304	}
				8305
				8306	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				8307	struct sched_entity *se, int cpu,
				8308	struct sched_entity *parent)
				8309	{
				8310	struct rq *rq = cpu_rq(cpu);
				8311
				8312	cfs_rq->tg = tg;
				8313	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8314	init_cfs_rq_runtime(cfs_rq);
				8315
				8316	tg->cfs_rq[cpu] = cfs_rq;
				8317	tg->se[cpu] = se;
				8318
				8319	/* se could be NULL for root_task_group */
				8320	if (!se)
				8321	return;
				8322
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	8323	if (!parent) {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8324	se->cfs_rq = &rq->cfs;
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	8325	se->depth = 0;
				8326	} else {
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8327	se->cfs_rq = parent->my_q;
Peter Zijlstra	fed14d4	2012-02-11 06:05:00 +0100	[diff] [blame]	8328	se->depth = parent->depth + 1;
				8329	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8330
				8331	se->my_q = cfs_rq;
Paul Turner	0ac9b1c	2013-10-16 11:16:27 -0700	[diff] [blame]	8332	/* guarantee group entities always have weight */
				8333	update_load_set(&se->load, NICE_0_LOAD);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8334	se->parent = parent;
				8335	}
				8336
				8337	static DEFINE_MUTEX(shares_mutex);
				8338
				8339	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				8340	{
				8341	int i;
				8342	unsigned long flags;
				8343
				8344	/*
				8345	* We can't change the weight of the root cgroup.
				8346	*/
				8347	if (!tg->se[0])
				8348	return -EINVAL;
				8349
				8350	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				8351
				8352	mutex_lock(&shares_mutex);
				8353	if (tg->shares == shares)
				8354	goto done;
				8355
				8356	tg->shares = shares;
				8357	for_each_possible_cpu(i) {
				8358	struct rq *rq = cpu_rq(i);
				8359	struct sched_entity *se;
				8360
				8361	se = tg->se[i];
				8362	/* Propagate contribution to hierarchy */
				8363	raw_spin_lock_irqsave(&rq->lock, flags);
Frederic Weisbecker	71b1da4	2013-04-12 01:50:59 +0200	[diff] [blame]	8364
				8365	/* Possible calls to update_curr() need rq clock */
				8366	update_rq_clock(rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	8367	for_each_sched_entity(se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8368	update_cfs_shares(group_cfs_rq(se));
				8369	raw_spin_unlock_irqrestore(&rq->lock, flags);
				8370	}
				8371
				8372	done:
				8373	mutex_unlock(&shares_mutex);
				8374	return 0;
				8375	}
				8376	#else /* CONFIG_FAIR_GROUP_SCHED */
				8377
				8378	void free_fair_sched_group(struct task_group *tg) { }
				8379
				8380	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				8381	{
				8382	return 1;
				8383	}
				8384
				8385	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
				8386
				8387	#endif /* CONFIG_FAIR_GROUP_SCHED */
				8388
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	8389
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	8390	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	8391	{
				8392	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	8393	unsigned int rr_interval = 0;
				8394
				8395	/*
				8396	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				8397	* idle runqueue:
				8398	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	8399	if (rq->cfs.load.weight)
Zhu Yanhai	a59f4e0	2013-01-08 12:56:52 +0800	[diff] [blame]	8400	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	8401
				8402	return rr_interval;
				8403	}
				8404
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8405	/*
				8406	* All the scheduling class methods:
				8407	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8408	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	8409	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8410	.enqueue_task = enqueue_task_fair,
				8411	.dequeue_task = dequeue_task_fair,
				8412	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	8413	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8414
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	8415	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8416
				8417	.pick_next_task = pick_next_task_fair,
				8418	.put_prev_task = put_prev_task_fair,
				8419
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	8420	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	8421	.select_task_rq = select_task_rq_fair,
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	8422	.migrate_task_rq = migrate_task_rq_fair,
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	8423
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	8424	.rq_online = rq_online_fair,
				8425	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	8426
				8427	.task_waking = task_waking_fair,
Yuyang Du	1269557	2015-07-15 08:04:40 +0800	[diff] [blame]	8428	.task_dead = task_dead_fair,
Peter Zijlstra	c5b2803	2015-05-15 17:43:35 +0200	[diff] [blame]	8429	.set_cpus_allowed = set_cpus_allowed_common,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	8430	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8431
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	8432	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8433	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	8434	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8435
				8436	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	8437	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	8438	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	8439
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	8440	.get_rr_interval = get_rr_interval_fair,
				8441
Stanislaw Gruszka	6e99891	2014-11-12 16:58:44 +0100	[diff] [blame]	8442	.update_curr = update_curr_fair,
				8443
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	8444	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	8445	.task_move_group = task_move_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	8446	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8447	};
				8448
				8449	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8450	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8451	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8452	struct cfs_rq *cfs_rq;
				8453
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	8454	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	8455	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	8456	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	8457	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	8458	}
Srikar Dronamraju	397f237	2015-06-25 22:51:43 +0530	[diff] [blame]	8459
				8460	#ifdef CONFIG_NUMA_BALANCING
				8461	void show_numa_stats(struct task_struct p, struct seq_file m)
				8462	{
				8463	int node;
				8464	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
				8465
				8466	for_each_online_node(node) {
				8467	if (p->numa_faults) {
				8468	tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
				8469	tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
				8470	}
				8471	if (p->numa_group) {
				8472	gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
				8473	gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
				8474	}
				8475	print_numa_stats(m, node, tsf, tpf, gsf, gpf);
				8476	}
				8477	}
				8478	#endif /* CONFIG_NUMA_BALANCING */
				8479	#endif /* CONFIG_SCHED_DEBUG */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8480
				8481	__init void init_sched_fair_class(void)
				8482	{
				8483	#ifdef CONFIG_SMP
				8484	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				8485
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	8486	#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	8487	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8488	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	8489	cpu_notifier(sched_ilb_notifier, 0);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	8490	#endif
				8491	#endif /* SMP */
				8492
				8493	}