Blame - kernel/sched/fair.c - SHIFTPHONES/android_kernel_shift_sdm845

blob: 5b1e96687b4992f9b1d118028e52ac05fb13a49b [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
				20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	23	#include <linux/latencytop.h>
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	24	#include <linux/sched.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	25	#include <linux/cpumask.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	26	#include <linux/slab.h>
				27	#include <linux/profile.h>
				28	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	29	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	30	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	31	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	32
				33	#include <trace/events/sched.h>
				34
				35	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	36
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	37	/*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	38	* Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	39	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	40	*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	41	* NOTE: this latency value is not the same as the concept of
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	42	* 'timeslice length' - timeslices in CFS are of variable length
				43	* and have no persistent notion like in traditional, time-slice
				44	* based scheduling concepts.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	45	*
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	46	* (to see the precise effective timeslice length of your workload,
				47	* run vmstat and monitor the context-switches (cs) field)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	48	*/
Mike Galbraith	2140692	2010-03-11 17:17:15 +0100	[diff] [blame]	49	unsigned int sysctl_sched_latency = 6000000ULL;
				50	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	51
				52	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	53	* The initial- and re-scaling of tunables is configurable
				54	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
				55	*
				56	* Options are:
				57	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				58	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				59	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				60	*/
				61	enum sched_tunable_scaling sysctl_sched_tunable_scaling
				62	= SCHED_TUNABLESCALING_LOG;
				63
				64	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	65	* Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	66	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	67	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	68	unsigned int sysctl_sched_min_granularity = 750000ULL;
				69	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	70
				71	/*
				72	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
				73	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	74	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	75
				76	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	77	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	78	* parent will (try to) run first.
				79	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	80	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	81
				82	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	83	* SCHED_OTHER wake-up granularity.
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	84	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	85	*
				86	* This option delays the preemption effects of decoupled workloads
				87	* and reduces their over-scheduling. Synchronous workloads will still
				88	* have immediate wakeup/sleep latencies.
				89	*/
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	90	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	91	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	92
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	93	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				94
Paul Turner	a7a4f8a	2010-11-15 15:47:06 -0800	[diff] [blame]	95	/*
				96	* The exponential sliding window over which load is averaged for shares
				97	* distribution.
				98	* (default: 10msec)
				99	*/
				100	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
				101
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	102	#ifdef CONFIG_CFS_BANDWIDTH
				103	/*
				104	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				105	* each time a cfs_rq requests quota.
				106	*
				107	* Note: in the case that the slice exceeds the runtime remaining (either due
				108	* to consumption or the quota being specified to be smaller than the slice)
				109	* we will always only issue the remaining available time.
				110	*
				111	* default: 5 msec, units: microseconds
				112	*/
				113	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				114	#endif
				115
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	116	/*
				117	* Increase the granularity value when there are more CPUs,
				118	* because with more CPUs the 'effective latency' as visible
				119	* to users decreases. But the relationship is not linear,
				120	* so pick a second-best guess by going with the log2 of the
				121	* number of CPUs.
				122	*
				123	* This idea comes from the SD scheduler of Con Kolivas:
				124	*/
				125	static int get_update_sysctl_factor(void)
				126	{
				127	unsigned int cpus = min_t(int, num_online_cpus(), 8);
				128	unsigned int factor;
				129
				130	switch (sysctl_sched_tunable_scaling) {
				131	case SCHED_TUNABLESCALING_NONE:
				132	factor = 1;
				133	break;
				134	case SCHED_TUNABLESCALING_LINEAR:
				135	factor = cpus;
				136	break;
				137	case SCHED_TUNABLESCALING_LOG:
				138	default:
				139	factor = 1 + ilog2(cpus);
				140	break;
				141	}
				142
				143	return factor;
				144	}
				145
				146	static void update_sysctl(void)
				147	{
				148	unsigned int factor = get_update_sysctl_factor();
				149
				150	#define SET_SYSCTL(name) \
				151	(sysctl_##name = (factor) * normalized_sysctl_##name)
				152	SET_SYSCTL(sched_min_granularity);
				153	SET_SYSCTL(sched_latency);
				154	SET_SYSCTL(sched_wakeup_granularity);
				155	#undef SET_SYSCTL
				156	}
				157
				158	void sched_init_granularity(void)
				159	{
				160	update_sysctl();
				161	}
				162
				163	#if BITS_PER_LONG == 32
				164	# define WMULT_CONST (~0UL)
				165	#else
				166	# define WMULT_CONST (1UL << 32)
				167	#endif
				168
				169	#define WMULT_SHIFT 32
				170
				171	/*
				172	* Shift right and round:
				173	*/
				174	#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
				175
				176	/*
				177	* delta *= weight / lw
				178	*/
				179	static unsigned long
				180	calc_delta_mine(unsigned long delta_exec, unsigned long weight,
				181	struct load_weight *lw)
				182	{
				183	u64 tmp;
				184
				185	/*
				186	* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
				187	* entities since MIN_SHARES = 2. Treat weight as 1 if less than
				188	* 2^SCHED_LOAD_RESOLUTION.
				189	*/
				190	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
				191	tmp = (u64)delta_exec * scale_load_down(weight);
				192	else
				193	tmp = (u64)delta_exec;
				194
				195	if (!lw->inv_weight) {
				196	unsigned long w = scale_load_down(lw->weight);
				197
				198	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				199	lw->inv_weight = 1;
				200	else if (unlikely(!w))
				201	lw->inv_weight = WMULT_CONST;
				202	else
				203	lw->inv_weight = WMULT_CONST / w;
				204	}
				205
				206	/*
				207	* Check whether we'd overflow the 64-bit multiplication:
				208	*/
				209	if (unlikely(tmp > WMULT_CONST))
				210	tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
				211	WMULT_SHIFT/2);
				212	else
				213	tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
				214
				215	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
				216	}
				217
				218
				219	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	220
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	221	/**************************************************************
				222	* CFS operations on generic schedulable entities:
				223	*/
				224
				225	#ifdef CONFIG_FAIR_GROUP_SCHED
				226
				227	/* cpu runqueue to which this cfs_rq is attached */
				228	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				229	{
				230	return cfs_rq->rq;
				231	}
				232
/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!(se)->my_q)
				235
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	236	static inline struct task_struct task_of(struct sched_entity se)
				237	{
				238	#ifdef CONFIG_SCHED_DEBUG
				239	WARN_ON_ONCE(!entity_is_task(se));
				240	#endif
				241	return container_of(se, struct task_struct, se);
				242	}
				243
/*
 * Walk up scheduling entities hierarchy. Advances @se itself; it is NULL
 * once the loop has run to completion.
 */
#define for_each_sched_entity(se) \
		for (; (se); (se) = (se)->parent)
				247
				248	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				249	{
				250	return p->se.cfs_rq;
				251	}
				252
				253	/* runqueue on which this entity is (to be) queued */
				254	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				255	{
				256	return se->cfs_rq;
				257	}
				258
				259	/* runqueue "owned" by this group */
				260	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				261	{
				262	return grp->my_q;
				263	}
				264
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	265	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				266	int force_update);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	267
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	268	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				269	{
				270	if (!cfs_rq->on_list) {
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	271	/*
				272	* Ensure we either appear before our parent (if already
				273	* enqueued) or force our parent to appear after us when it is
				274	* enqueued. The fact that we always enqueue bottom-up
				275	* reduces this to two cases.
				276	*/
				277	if (cfs_rq->tg->parent &&
				278	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
				279	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	280	&rq_of(cfs_rq)->leaf_cfs_rq_list);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	281	} else {
				282	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				283	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				284	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	285
				286	cfs_rq->on_list = 1;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	287	/* We should have no load, but we need to update last_decay. */
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	288	update_cfs_rq_blocked_load(cfs_rq, 0);
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	289	}
				290	}
				291
				292	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				293	{
				294	if (cfs_rq->on_list) {
				295	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				296	cfs_rq->on_list = 0;
				297	}
				298	}
				299
/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &(rq)->leaf_cfs_rq_list, leaf_cfs_rq_list)
				303
				304	/* Do the two (enqueued) entities belong to the same group ? */
				305	static inline int
				306	is_same_group(struct sched_entity se, struct sched_entity pse)
				307	{
				308	if (se->cfs_rq == pse->cfs_rq)
				309	return 1;
				310
				311	return 0;
				312	}
				313
				314	static inline struct sched_entity parent_entity(struct sched_entity se)
				315	{
				316	return se->parent;
				317	}
				318
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	319	/* return depth at which a sched entity is present in the hierarchy */
				320	static inline int depth_se(struct sched_entity *se)
				321	{
				322	int depth = 0;
				323
				324	for_each_sched_entity(se)
				325	depth++;
				326
				327	return depth;
				328	}
				329
				330	static void
				331	find_matching_se(struct sched_entity se, struct sched_entity pse)
				332	{
				333	int se_depth, pse_depth;
				334
				335	/*
				336	* preemption test can be made between sibling entities who are in the
				337	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				338	* both tasks until we find their ancestors who are siblings of common
				339	* parent.
				340	*/
				341
				342	/* First walk up until both entities are at same depth */
				343	se_depth = depth_se(*se);
				344	pse_depth = depth_se(*pse);
				345
				346	while (se_depth > pse_depth) {
				347	se_depth--;
				348	se = parent_entity(se);
				349	}
				350
				351	while (pse_depth > se_depth) {
				352	pse_depth--;
				353	pse = parent_entity(pse);
				354	}
				355
				356	while (!is_same_group(se, pse)) {
				357	se = parent_entity(se);
				358	pse = parent_entity(pse);
				359	}
				360	}
				361
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	362	#else /* !CONFIG_FAIR_GROUP_SCHED */
				363
				364	static inline struct task_struct task_of(struct sched_entity se)
				365	{
				366	return container_of(se, struct task_struct, se);
				367	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	368
				369	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				370	{
				371	return container_of(cfs_rq, struct rq, cfs);
				372	}
				373
/* Without group scheduling every entity is a task */
#define entity_is_task(se)	1

/* ... and there is no hierarchy: visit @se once, then set it to NULL */
#define for_each_sched_entity(se) \
		for (; (se); (se) = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	378
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	379	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	380	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	381	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	382	}
				383
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	384	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				385	{
				386	struct task_struct *p = task_of(se);
				387	struct rq *rq = task_rq(p);
				388
				389	return &rq->cfs;
				390	}
				391
				392	/* runqueue "owned" by this group */
				393	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				394	{
				395	return NULL;
				396	}
				397
/* No-op in the !CONFIG_FAIR_GROUP_SCHED build. */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
				401
				402	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				403	{
				404	}
				405
/* Single iteration over the rq's embedded cfs runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for ((cfs_rq) = &(rq)->cfs; (cfs_rq); (cfs_rq) = NULL)
				408
				409	static inline int
				410	is_same_group(struct sched_entity se, struct sched_entity pse)
				411	{
				412	return 1;
				413	}
				414
				415	static inline struct sched_entity parent_entity(struct sched_entity se)
				416	{
				417	return NULL;
				418	}
				419
/* No-op in the !CONFIG_FAIR_GROUP_SCHED build; *se and *pse are left as is. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
				424
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	425	#endif /* CONFIG_FAIR_GROUP_SCHED */
				426
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	427	static __always_inline
				428	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	429
				430	/**************************************************************
				431	* Scheduling class tree data structure manipulation methods:
				432	*/
				433
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	434	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	435	{
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	436	s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	437	if (delta > 0)
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	438	max_vruntime = vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	439
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	440	return max_vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	441	}
				442
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	443	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	444	{
				445	s64 delta = (s64)(vruntime - min_vruntime);
				446	if (delta < 0)
				447	min_vruntime = vruntime;
				448
				449	return min_vruntime;
				450	}
				451
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	452	static inline int entity_before(struct sched_entity *a,
				453	struct sched_entity *b)
				454	{
				455	return (s64)(a->vruntime - b->vruntime) < 0;
				456	}
				457
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	458	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				459	{
				460	u64 vruntime = cfs_rq->min_vruntime;
				461
				462	if (cfs_rq->curr)
				463	vruntime = cfs_rq->curr->vruntime;
				464
				465	if (cfs_rq->rb_leftmost) {
				466	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				467	struct sched_entity,
				468	run_node);
				469
Peter Zijlstra	e17036d	2009-01-15 14:53:39 +0100	[diff] [blame]	470	if (!cfs_rq->curr)
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	471	vruntime = se->vruntime;
				472	else
				473	vruntime = min_vruntime(vruntime, se->vruntime);
				474	}
				475
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	476	/* ensure we never gain time by being placed backwards. */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	477	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	478	#ifndef CONFIG_64BIT
				479	smp_wmb();
				480	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				481	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	482	}
				483
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	484	/*
				485	* Enqueue an entity into the rb-tree:
				486	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	487	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	488	{
				489	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				490	struct rb_node *parent = NULL;
				491	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	492	int leftmost = 1;
				493
				494	/*
				495	* Find the right place in the rbtree:
				496	*/
				497	while (*link) {
				498	parent = *link;
				499	entry = rb_entry(parent, struct sched_entity, run_node);
				500	/*
				501	* We dont care about collisions. Nodes with
				502	* the same key stay together.
				503	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	504	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	505	link = &parent->rb_left;
				506	} else {
				507	link = &parent->rb_right;
				508	leftmost = 0;
				509	}
				510	}
				511
				512	/*
				513	* Maintain a cache of leftmost tree entries (it is frequently
				514	* used):
				515	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	516	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	517	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	518
				519	rb_link_node(&se->run_node, parent, link);
				520	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	521	}
				522
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	523	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	524	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	525	if (cfs_rq->rb_leftmost == &se->run_node) {
				526	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	527
				528	next_node = rb_next(&se->run_node);
				529	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	530	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	531
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	532	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	533	}
				534
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	535	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	536	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	537	struct rb_node *left = cfs_rq->rb_leftmost;
				538
				539	if (!left)
				540	return NULL;
				541
				542	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	543	}
				544
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	545	static struct sched_entity __pick_next_entity(struct sched_entity se)
				546	{
				547	struct rb_node *next = rb_next(&se->run_node);
				548
				549	if (!next)
				550	return NULL;
				551
				552	return rb_entry(next, struct sched_entity, run_node);
				553	}
				554
				555	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	556	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	557	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	558	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	559
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	560	if (!last)
				561	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	562
				563	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	564	}
				565
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	566	/**************************************************************
				567	* Scheduling class statistics methods:
				568	*/
				569
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	570	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	571	void __user buffer, size_t lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	572	loff_t *ppos)
				573	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	574	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	575	int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	576
				577	if (ret \|\| !write)
				578	return ret;
				579
				580	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				581	sysctl_sched_min_granularity);
				582
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	583	#define WRT_SYSCTL(name) \
				584	(normalized_sysctl_##name = sysctl_##name / (factor))
				585	WRT_SYSCTL(sched_min_granularity);
				586	WRT_SYSCTL(sched_latency);
				587	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	588	#undef WRT_SYSCTL
				589
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	590	return 0;
				591	}
				592	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	593
				594	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	595	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	596	*/
				597	static inline unsigned long
				598	calc_delta_fair(unsigned long delta, struct sched_entity *se)
				599	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	600	if (unlikely(se->load.weight != NICE_0_LOAD))
				601	delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	602
				603	return delta;
				604	}
				605
				606	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	607	* The idea is to set a period in which each task runs once.
				608	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	609	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	610	* this period because otherwise the slices get too small.
				611	*
				612	* p = (nr <= nl) ? l : l*nr/nl
				613	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	614	static u64 __sched_period(unsigned long nr_running)
				615	{
				616	u64 period = sysctl_sched_latency;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	617	unsigned long nr_latency = sched_nr_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	618
				619	if (unlikely(nr_running > nr_latency)) {
Peter Zijlstra	4bf0b77	2008-01-25 21:08:21 +0100	[diff] [blame]	620	period = sysctl_sched_min_granularity;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	621	period *= nr_running;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	622	}
				623
				624	return period;
				625	}
				626
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	627	/*
				628	* We calculate the wall-time slice from the period by taking a part
				629	* proportional to the weight.
				630	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	631	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	632	*/
Peter Zijlstra	6d0f0ebd	2007-10-15 17:00:05 +0200	[diff] [blame]	633	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	634	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	635	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	636
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	637	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	638	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	639	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	640
				641	cfs_rq = cfs_rq_of(se);
				642	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	643
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	644	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	645	lw = cfs_rq->load;
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	646
				647	update_load_add(&lw, se->load.weight);
				648	load = &lw;
				649	}
				650	slice = calc_delta_mine(slice, se->load.weight, load);
				651	}
				652	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	653	}
				654
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	655	/*
Andrei Epure	660cc00	2013-03-11 12:03:20 +0200	[diff] [blame]	656	* We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	657	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	658	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	659	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	660	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	661	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	662	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	663	}
				664
				665	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	666	* Update the current task's runtime statistics. Skip current tasks that
				667	* are not in our scheduling class.
				668	*/
				669	static inline void
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	670	__update_curr(struct cfs_rq cfs_rq, struct sched_entity curr,
				671	unsigned long delta_exec)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	672	{
Ingo Molnar	bbdba7c	2007-10-15 17:00:06 +0200	[diff] [blame]	673	unsigned long delta_exec_weighted;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	674
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	675	schedstat_set(curr->statistics.exec_max,
				676	max((u64)delta_exec, curr->statistics.exec_max));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	677
				678	curr->sum_exec_runtime += delta_exec;
Ingo Molnar	7a62eab	2007-10-15 17:00:06 +0200	[diff] [blame]	679	schedstat_add(cfs_rq, exec_clock, delta_exec);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	680	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	681
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	682	curr->vruntime += delta_exec_weighted;
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	683	update_min_vruntime(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	684	}
				685
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	686	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	687	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	688	struct sched_entity *curr = cfs_rq->curr;
Venkatesh Pallipadi	305e683	2010-10-04 17:03:21 -0700	[diff] [blame]	689	u64 now = rq_of(cfs_rq)->clock_task;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	690	unsigned long delta_exec;
				691
				692	if (unlikely(!curr))
				693	return;
				694
				695	/*
				696	* Get the amount of time the current task was running
				697	* since the last time we changed load (this cannot
				698	* overflow on 32 bits):
				699	*/
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	700	delta_exec = (unsigned long)(now - curr->exec_start);
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	701	if (!delta_exec)
				702	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	703
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	704	__update_curr(cfs_rq, curr, delta_exec);
				705	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	706
				707	if (entity_is_task(curr)) {
				708	struct task_struct *curtask = task_of(curr);
				709
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	710	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	711	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	712	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	713	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	714
				715	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	716	}
				717
				718	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	719	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	720	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	721	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	722	}
				723
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	724	/*
				725	* Task is being enqueued - update stats:
				726	*/
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	727	static void update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	728	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	729	/*
				730	* Are we enqueueing a waiting task? (for current tasks
				731	* a dequeue/enqueue event is a NOP)
				732	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	733	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	734	update_stats_wait_start(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	735	}
				736
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	737	static void
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	738	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	739	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	740	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
				741	rq_of(cfs_rq)->clock - se->statistics.wait_start));
				742	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
				743	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
				744	rq_of(cfs_rq)->clock - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	745	#ifdef CONFIG_SCHEDSTATS
				746	if (entity_is_task(se)) {
				747	trace_sched_stat_wait(task_of(se),
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	748	rq_of(cfs_rq)->clock - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	749	}
				750	#endif
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	751	schedstat_set(se->statistics.wait_start, 0);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	752	}
				753
				754	static inline void
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	755	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	756	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	757	/*
				758	* Mark the end of the wait period if dequeueing a
				759	* waiting task:
				760	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	761	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	762	update_stats_wait_end(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	763	}
				764
				765	/*
				766	* We are picking a new current task - update its stats:
				767	*/
				768	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	769	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	770	{
				771	/*
				772	* We are starting a new run period:
				773	*/
Venkatesh Pallipadi	305e683	2010-10-04 17:03:21 -0700	[diff] [blame]	774	se->exec_start = rq_of(cfs_rq)->clock_task;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	775	}
				776
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	777	/**************************************************
				778	* Scheduling class queueing methods:
				779	*/
				780
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	781	#ifdef CONFIG_NUMA_BALANCING
				782	/*
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	783	* numa task sample period in ms
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	784	*/
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	785	unsigned int sysctl_numa_balancing_scan_period_min = 100;
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	786	unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
				787	unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	788
				789	/* Portion of address space to scan in MB */
				790	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	791
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	792	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				793	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				794
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	795	static void task_numa_placement(struct task_struct *p)
				796	{
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame]	797	int seq;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	798
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame]	799	if (!p->mm) /* for example, ksmd faulting in a user's mm */
				800	return;
				801	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	802	if (p->numa_scan_seq == seq)
				803	return;
				804	p->numa_scan_seq = seq;
				805
				806	/* FIXME: Scheduling placement policy hints go here */
				807	}
				808
				809	/*
				810	* Got a PROT_NONE fault for a page on @node.
				811	*/
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	812	void task_numa_fault(int node, int pages, bool migrated)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	813	{
				814	struct task_struct *p = current;
				815
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	816	if (!sched_feat_numa(NUMA))
				817	return;
				818
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	819	/* FIXME: Allocate task-specific structure for placement policy here */
				820
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	821	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	822	* If pages are properly placed (did not migrate) then scan slower.
				823	* This is reset periodically in case of phase changes
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	824	*/
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	825	if (!migrated)
				826	p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
				827	p->numa_scan_period + jiffies_to_msecs(10));
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	828
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	829	task_numa_placement(p);
				830	}
				831
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	832	static void reset_ptenuma_scan(struct task_struct *p)
				833	{
				834	ACCESS_ONCE(p->mm->numa_scan_seq)++;
				835	p->mm->numa_scan_offset = 0;
				836	}
				837
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	838	/*
				839	* The expensive part of numa migration is done from task_work context.
				840	* Triggered from task_tick_numa().
				841	*/
				842	void task_numa_work(struct callback_head *work)
				843	{
				844	unsigned long migrate, next_scan, now = jiffies;
				845	struct task_struct *p = current;
				846	struct mm_struct *mm = p->mm;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	847	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	848	unsigned long start, end;
				849	long pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	850
				851	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
				852
				853	work->next = work; /* protect against double add */
				854	/*
				855	* Who cares about NUMA placement when they're dying.
				856	*
				857	* NOTE: make sure not to dereference p->mm before this check,
				858	* exit_task_work() happens _after_ exit_mm() so we could be called
				859	* without p->mm even though we still had it when we enqueued this
				860	* work.
				861	*/
				862	if (p->flags & PF_EXITING)
				863	return;
				864
				865	/*
Mel Gorman	5bca230	2012-11-22 14:40:03 +0000	[diff] [blame]	866	* We do not care about task placement until a task runs on a node
				867	* other than the first one used by the address space. This is
				868	* largely because migrations are driven by what CPU the task
				869	* is running on. If it's never scheduled on another node, it'll
				870	* not migrate so why bother trapping the fault.
				871	*/
				872	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
				873	mm->first_nid = numa_node_id();
				874	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
				875	/* Are we running on a new node yet? */
				876	if (numa_node_id() == mm->first_nid &&
				877	!sched_feat_numa(NUMA_FORCE))
				878	return;
				879
				880	mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
				881	}
				882
				883	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	884	* Reset the scan period if enough time has gone by. Objective is that
				885	* scanning will be reduced if pages are properly placed. As tasks
				886	* can enter different phases this needs to be re-examined. Lacking
				887	* proper tracking of reference behaviour, this blunt hammer is used.
				888	*/
				889	migrate = mm->numa_next_reset;
				890	if (time_after(now, migrate)) {
				891	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
				892	next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				893	xchg(&mm->numa_next_reset, next_scan);
				894	}
				895
				896	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	897	* Enforce maximal scan/migration frequency..
				898	*/
				899	migrate = mm->numa_next_scan;
				900	if (time_before(now, migrate))
				901	return;
				902
				903	if (p->numa_scan_period == 0)
				904	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
				905
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	906	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	907	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				908	return;
				909
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	910	/*
				911	* Do not set pte_numa if the current running node is rate-limited.
				912	* This loses statistics on the fault but if we are unwilling to
				913	* migrate to this node, it is less likely we can do useful work
				914	*/
				915	if (migrate_ratelimited(numa_node_id()))
				916	return;
				917
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	918	start = mm->numa_scan_offset;
				919	pages = sysctl_numa_balancing_scan_size;
				920	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
				921	if (!pages)
				922	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	923
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	924	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	925	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	926	if (!vma) {
				927	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	928	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	929	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	930	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	931	for (; vma; vma = vma->vm_next) {
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	932	if (!vma_migratable(vma))
				933	continue;
				934
				935	/* Skip small VMAs. They are not likely to be of relevance */
Mel Gorman	221392c	2012-12-17 14:05:53 +0000	[diff] [blame]	936	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	937	continue;
				938
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	939	do {
				940	start = max(start, vma->vm_start);
				941	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				942	end = min(end, vma->vm_end);
				943	pages -= change_prot_numa(vma, start, end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	944
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	945	start = end;
				946	if (pages <= 0)
				947	goto out;
				948	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	949	}
				950
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	951	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	952	/*
				953	* It is possible to reach the end of the VMA list but the last few VMAs are
				954	* not guaranteed to be vma_migratable. If they are not, we would find the
				955	* !migratable VMA on the next scan but not reset the scanner to the start
				956	* so check it now.
				957	*/
				958	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	959	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	960	else
				961	reset_ptenuma_scan(p);
				962	up_read(&mm->mmap_sem);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	963	}
				964
				965	/*
				966	* Drive the periodic memory faults..
				967	*/
				968	void task_tick_numa(struct rq rq, struct task_struct curr)
				969	{
				970	struct callback_head *work = &curr->numa_work;
				971	u64 period, now;
				972
				973	/*
				974	* We don't care about NUMA placement if we don't have memory.
				975	*/
				976	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				977	return;
				978
				979	/*
				980	* Using runtime rather than walltime has the dual advantage that
				981	* we (mostly) drive the selection from busy threads and that the
				982	* task needs to have done some actual work before we bother with
				983	* NUMA placement.
				984	*/
				985	now = curr->se.sum_exec_runtime;
				986	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				987
				988	if (now - curr->node_stamp > period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	989	if (!curr->node_stamp)
				990	curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	991	curr->node_stamp = now;
				992
				993	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				994	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				995	task_work_add(curr, work, true);
				996	}
				997	}
				998	}
				999	#else
				1000	static void task_tick_numa(struct rq rq, struct task_struct curr)
				1001	{
				1002	}
				1003	#endif /* CONFIG_NUMA_BALANCING */
				1004
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1005	static void
				1006	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				1007	{
				1008	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1009	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1010	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1011	#ifdef CONFIG_SMP
				1012	if (entity_is_task(se))
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	1013	list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1014	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1015	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1016	}
				1017
				1018	static void
				1019	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				1020	{
				1021	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1022	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1023	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1024	if (entity_is_task(se))
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	1025	list_del_init(&se->group_node);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1026	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1027	}
				1028
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1029	#ifdef CONFIG_FAIR_GROUP_SCHED
				1030	# ifdef CONFIG_SMP
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1031	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)
				1032	{
				1033	long tg_weight;
				1034
				1035	/*
				1036	* Use this CPU's actual weight instead of the last load_contribution
				1037	* to gain a more accurate current total weight. See
				1038	* update_cfs_rq_load_contribution().
				1039	*/
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1040	tg_weight = atomic64_read(&tg->load_avg);
				1041	tg_weight -= cfs_rq->tg_load_contrib;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1042	tg_weight += cfs_rq->load.weight;
				1043
				1044	return tg_weight;
				1045	}
				1046
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1047	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1048	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1049	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1050
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1051	tg_weight = calc_tg_weight(tg, cfs_rq);
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1052	load = cfs_rq->load.weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1053
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1054	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1055	if (tg_weight)
				1056	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1057
				1058	if (shares < MIN_SHARES)
				1059	shares = MIN_SHARES;
				1060	if (shares > tg->shares)
				1061	shares = tg->shares;
				1062
				1063	return shares;
				1064	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1065	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1066	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1067	{
				1068	return tg->shares;
				1069	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1070	# endif /* CONFIG_SMP */
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1071	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				1072	unsigned long weight)
				1073	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1074	if (se->on_rq) {
				1075	/* commit outstanding execution time */
				1076	if (cfs_rq->curr == se)
				1077	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1078	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1079	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1080
				1081	update_load_set(&se->load, weight);
				1082
				1083	if (se->on_rq)
				1084	account_entity_enqueue(cfs_rq, se);
				1085	}
				1086
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1087	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				1088
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1089	static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1090	{
				1091	struct task_group *tg;
				1092	struct sched_entity *se;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1093	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1094
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1095	tg = cfs_rq->tg;
				1096	se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	1097	if (!se \|\| throttled_hierarchy(cfs_rq))
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1098	return;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1099	#ifndef CONFIG_SMP
				1100	if (likely(se->load.weight == tg->shares))
				1101	return;
				1102	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1103	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1104
				1105	reweight_entity(cfs_rq_of(se), se, shares);
				1106	}
				1107	#else /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1108	static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1109	{
				1110	}
				1111	#endif /* CONFIG_FAIR_GROUP_SCHED */
				1112
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	1113	/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
				1114	#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1115	/*
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1116	* We choose a half-life close to 1 scheduling period.
				1117	* Note: The tables below are dependent on this value.
				1118	*/
				1119	#define LOAD_AVG_PERIOD 32
				1120	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
				1121	#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
				1122
				1123	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				1124	static const u32 runnable_avg_yN_inv[] = {
				1125	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				1126	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				1127	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				1128	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				1129	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				1130	0x85aac367, 0x82cd8698,
				1131	};
				1132
				1133	/*
				1134	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				1135	* over-estimates when re-combining.
				1136	*/
				1137	static const u32 runnable_avg_yN_sum[] = {
				1138	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				1139	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				1140	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				1141	};
				1142
				1143	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1144	* Approximate:
				1145	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				1146	*/
				1147	static __always_inline u64 decay_load(u64 val, u64 n)
				1148	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1149	unsigned int local_n;
				1150
				1151	if (!n)
				1152	return val;
				1153	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				1154	return 0;
				1155
				1156	/* after bounds checking we can collapse to 32-bit */
				1157	local_n = n;
				1158
				1159	/*
				1160	* As y^PERIOD = 1/2, we can combine
				1161	* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
				1162	* With a look-up table which covers k^n (n<PERIOD)
				1163	*
				1164	* To achieve constant time decay_load.
				1165	*/
				1166	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				1167	val >>= local_n / LOAD_AVG_PERIOD;
				1168	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1169	}
				1170
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1171	val *= runnable_avg_yN_inv[local_n];
				1172	/* We don't use SRR here since we always want to round down. */
				1173	return val >> 32;
				1174	}
				1175
				1176	/*
				1177	* For updates fully spanning n periods, the contribution to runnable
				1178	* average will be: \Sum 1024*y^n
				1179	*
				1180	* We can compute this reasonably efficiently by combining:
				1181	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				1182	*/
				1183	static u32 __compute_runnable_contrib(u64 n)
				1184	{
				1185	u32 contrib = 0;
				1186
				1187	if (likely(n <= LOAD_AVG_PERIOD))
				1188	return runnable_avg_yN_sum[n];
				1189	else if (unlikely(n >= LOAD_AVG_MAX_N))
				1190	return LOAD_AVG_MAX;
				1191
				1192	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
				1193	do {
				1194	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
				1195	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
				1196
				1197	n -= LOAD_AVG_PERIOD;
				1198	} while (n > LOAD_AVG_PERIOD);
				1199
				1200	contrib = decay_load(contrib, n);
				1201	return contrib + runnable_avg_yN_sum[n];
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1202	}
				1203
				1204	/*
				1205	* We can represent the historical contribution to runnable average as the
				1206	* coefficients of a geometric series. To do this we sub-divide our runnable
				1207	* history into segments of approximately 1ms (1024us); label the segment that
				1208	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				1209	*
				1210	* [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
				1211	*      p0            p1           p2
				1212	*     (now)       (~1ms ago)  (~2ms ago)
				1213	*
				1214	* Let u_i denote the fraction of p_i that the entity was runnable.
				1215	*
				1216	* We then designate the fractions u_i as our co-efficients, yielding the
				1217	* following representation of historical load:
				1218	* u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
				1219	*
				1220	* We choose y based on the width of a reasonable scheduling period, fixing:
				1221	* y^32 = 0.5
				1222	*
				1223	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				1224	* approximately half as much as the contribution to load within the last ms
				1225	* (u_0).
				1226	*
				1227	* When a period "rolls over" and we have new u_0`, multiplying the previous
				1228	* sum again by y is sufficient to update:
				1229	* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
				1230	* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
				1231	*/
				1232	static __always_inline int __update_entity_runnable_avg(u64 now,
				1233	struct sched_avg *sa,
				1234	int runnable)
				1235	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1236	u64 delta, periods;
				1237	u32 runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1238	int delta_w, decayed = 0;
				1239
				1240	delta = now - sa->last_runnable_update;
				1241	/*
				1242	* This should only happen when time goes backwards, which it
				1243	* unfortunately does during sched clock init when we swap over to TSC.
				1244	*/
				1245	if ((s64)delta < 0) {
				1246	sa->last_runnable_update = now;
				1247	return 0;
				1248	}
				1249
				1250	/*
				1251	* Use 1024ns as the unit of measurement since it's a reasonable
				1252	* approximation of 1us and fast to compute.
				1253	*/
				1254	delta >>= 10;
				1255	if (!delta)
				1256	return 0;
				1257	sa->last_runnable_update = now;
				1258
				1259	/* delta_w is the amount already accumulated against our next period */
				1260	delta_w = sa->runnable_avg_period % 1024;
				1261	if (delta + delta_w >= 1024) {
				1262	/* period roll-over */
				1263	decayed = 1;
				1264
				1265	/*
				1266	* Now that we know we're crossing a period boundary, figure
				1267	* out how much from delta we need to complete the current
				1268	* period and accrue it.
				1269	*/
				1270	delta_w = 1024 - delta_w;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1271	if (runnable)
				1272	sa->runnable_avg_sum += delta_w;
				1273	sa->runnable_avg_period += delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1274
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1275	delta -= delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1276
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1277	/* Figure out how many additional periods this update spans */
				1278	periods = delta / 1024;
				1279	delta %= 1024;
				1280
				1281	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
				1282	periods + 1);
				1283	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
				1284	periods + 1);
				1285
				1286	/* Efficiently calculate \sum (1..n_period) 1024*y^i */
				1287	runnable_contrib = __compute_runnable_contrib(periods);
				1288	if (runnable)
				1289	sa->runnable_avg_sum += runnable_contrib;
				1290	sa->runnable_avg_period += runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1291	}
				1292
				1293	/* Remainder of delta accrued against u_0` */
				1294	if (runnable)
				1295	sa->runnable_avg_sum += delta;
				1296	sa->runnable_avg_period += delta;
				1297
				1298	return decayed;
				1299	}
				1300
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1301	/* Synchronize an entity's decay with its parenting cfs_rq.*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1302	static inline u64 __synchronize_entity_decay(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1303	{
				1304	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1305	u64 decays = atomic64_read(&cfs_rq->decay_counter);
				1306
				1307	decays -= se->avg.decay_count;
				1308	if (!decays)
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1309	return 0;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1310
				1311	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
				1312	se->avg.decay_count = 0;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1313
				1314	return decays;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1315	}
				1316
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1317	#ifdef CONFIG_FAIR_GROUP_SCHED
				1318	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1319	int force_update)
				1320	{
				1321	struct task_group *tg = cfs_rq->tg;
				1322	s64 tg_contrib;
				1323
				1324	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
				1325	tg_contrib -= cfs_rq->tg_load_contrib;
				1326
				1327	if (force_update \|\| abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
				1328	atomic64_add(tg_contrib, &tg->load_avg);
				1329	cfs_rq->tg_load_contrib += tg_contrib;
				1330	}
				1331	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1332
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1333	/*
				1334	* Aggregate cfs_rq runnable averages into an equivalent task_group
				1335	* representation for computing load contributions.
				1336	*/
				1337	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1338	struct cfs_rq *cfs_rq)
				1339	{
				1340	struct task_group *tg = cfs_rq->tg;
				1341	long contrib;
				1342
				1343	/* The fraction of a cpu used by this cfs_rq */
				1344	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
				1345	sa->runnable_avg_period + 1);
				1346	contrib -= cfs_rq->tg_runnable_contrib;
				1347
				1348	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
				1349	atomic_add(contrib, &tg->runnable_avg);
				1350	cfs_rq->tg_runnable_contrib += contrib;
				1351	}
				1352	}
				1353
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1354	static inline void __update_group_entity_contrib(struct sched_entity *se)
				1355	{
				1356	struct cfs_rq *cfs_rq = group_cfs_rq(se);
				1357	struct task_group *tg = cfs_rq->tg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1358	int runnable_avg;
				1359
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1360	u64 contrib;
				1361
				1362	contrib = cfs_rq->tg_load_contrib * tg->shares;
				1363	se->avg.load_avg_contrib = div64_u64(contrib,
				1364	atomic64_read(&tg->load_avg) + 1);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1365
				1366	/*
				1367	* For group entities we need to compute a correction term in the case
				1368	* that they are consuming <1 cpu so that we would contribute the same
				1369	* load as a task of equal weight.
				1370	*
				1371	* Explicitly co-ordinating this measurement would be expensive, but
				1372	* fortunately the sum of each cpus contribution forms a usable
				1373	* lower-bound on the true value.
				1374	*
				1375	* Consider the aggregate of 2 contributions. Either they are disjoint
				1376	* (and the sum represents true value) or they are disjoint and we are
				1377	* understating by the aggregate of their overlap.
				1378	*
				1379	* Extending this to N cpus, for a given overlap, the maximum amount we
				1380	* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
				1381	* cpus that overlap for this interval and w_i is the interval width.
				1382	*
				1383	* On a small machine; the first term is well-bounded which bounds the
				1384	* total error since w_i is a subset of the period. Whereas on a
				1385	* larger machine, while this first term can be larger, if w_i is the
				1386	* of consequential size guaranteed to see n_i*w_i quickly converge to
				1387	* our upper bound of 1-cpu.
				1388	*/
				1389	runnable_avg = atomic_read(&tg->runnable_avg);
				1390	if (runnable_avg < NICE_0_LOAD) {
				1391	se->avg.load_avg_contrib *= runnable_avg;
				1392	se->avg.load_avg_contrib >>= NICE_0_SHIFT;
				1393	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1394	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1395	#else
				1396	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1397	int force_update) {}
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1398	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1399	struct cfs_rq *cfs_rq) {}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1400	static inline void __update_group_entity_contrib(struct sched_entity *se) {}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1401	#endif
				1402
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1403	static inline void __update_task_entity_contrib(struct sched_entity *se)
				1404	{
				1405	u32 contrib;
				1406
				1407	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
				1408	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
				1409	contrib /= (se->avg.runnable_avg_period + 1);
				1410	se->avg.load_avg_contrib = scale_load(contrib);
				1411	}
				1412
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1413	/* Compute the current contribution to load_avg by se, return any delta */
				1414	static long __update_entity_load_avg_contrib(struct sched_entity *se)
				1415	{
				1416	long old_contrib = se->avg.load_avg_contrib;
				1417
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1418	if (entity_is_task(se)) {
				1419	__update_task_entity_contrib(se);
				1420	} else {
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1421	__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1422	__update_group_entity_contrib(se);
				1423	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1424
				1425	return se->avg.load_avg_contrib - old_contrib;
				1426	}
				1427
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1428	static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
				1429	long load_contrib)
				1430	{
				1431	if (likely(load_contrib < cfs_rq->blocked_load_avg))
				1432	cfs_rq->blocked_load_avg -= load_contrib;
				1433	else
				1434	cfs_rq->blocked_load_avg = 0;
				1435	}
				1436
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1437	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
				1438
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1439	/* Update a sched_entity's runnable average */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1440	static inline void update_entity_load_avg(struct sched_entity *se,
				1441	int update_cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1442	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1443	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1444	long contrib_delta;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1445	u64 now;
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1446
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1447	/*
				1448	* For a group entity we need to use their owned cfs_rq_clock_task() in
				1449	* case they are the parent of a throttled hierarchy.
				1450	*/
				1451	if (entity_is_task(se))
				1452	now = cfs_rq_clock_task(cfs_rq);
				1453	else
				1454	now = cfs_rq_clock_task(group_cfs_rq(se));
				1455
				1456	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1457	return;
				1458
				1459	contrib_delta = __update_entity_load_avg_contrib(se);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1460
				1461	if (!update_cfs_rq)
				1462	return;
				1463
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1464	if (se->on_rq)
				1465	cfs_rq->runnable_load_avg += contrib_delta;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1466	else
				1467	subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
				1468	}
				1469
				1470	/*
				1471	* Decay the load contributed by all blocked children and account this so that
				1472	* their contribution may appropriately discounted when they wake up.
				1473	*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1474	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1475	{
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1476	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1477	u64 decays;
				1478
				1479	decays = now - cfs_rq->last_decay;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1480	if (!decays && !force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1481	return;
				1482
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1483	if (atomic64_read(&cfs_rq->removed_load)) {
				1484	u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
				1485	subtract_blocked_load_contrib(cfs_rq, removed_load);
				1486	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1487
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1488	if (decays) {
				1489	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
				1490	decays);
				1491	atomic64_add(decays, &cfs_rq->decay_counter);
				1492	cfs_rq->last_decay = now;
				1493	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1494
				1495	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1496	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1497
				1498	static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
				1499	{
				1500	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1501	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1502	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1503
				1504	/* Add the load generated by se into cfs_rq's child load-average */
				1505	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1506	struct sched_entity *se,
				1507	int wakeup)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1508	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1509	/*
				1510	* We track migrations using entity decay_count <= 0, on a wake-up
				1511	* migration we use a negative decay count to track the remote decays
				1512	* accumulated while sleeping.
				1513	*/
				1514	if (unlikely(se->avg.decay_count <= 0)) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1515	se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1516	if (se->avg.decay_count) {
				1517	/*
				1518	* In a wake-up migration we have to approximate the
				1519	* time sleeping. This is because we can't synchronize
				1520	* clock_task between the two cpus, and it is not
				1521	* guaranteed to be read-safe. Instead, we can
				1522	* approximate this using our carried decays, which are
				1523	* explicitly atomically readable.
				1524	*/
				1525	se->avg.last_runnable_update -= (-se->avg.decay_count)
				1526	<< 20;
				1527	update_entity_load_avg(se, 0);
				1528	/* Indicate that we're now synchronized and on-rq */
				1529	se->avg.decay_count = 0;
				1530	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1531	wakeup = 0;
				1532	} else {
				1533	__synchronize_entity_decay(se);
				1534	}
				1535
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1536	/* migrated tasks did not contribute to our blocked load */
				1537	if (wakeup) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1538	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1539	update_entity_load_avg(se, 0);
				1540	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1541
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1542	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1543	/* we force update consideration on load-balancer moves */
				1544	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1545	}
				1546
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1547	/*
				1548	* Remove se's load from this cfs_rq child load-average, if the entity is
				1549	* transitioning to a blocked state we track its projected decay using
				1550	* blocked_load_avg.
				1551	*/
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1552	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1553	struct sched_entity *se,
				1554	int sleep)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1555	{
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1556	update_entity_load_avg(se, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1557	/* we force update consideration on load-balancer moves */
				1558	update_cfs_rq_blocked_load(cfs_rq, !sleep);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1559
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1560	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1561	if (sleep) {
				1562	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
				1563	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				1564	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1565	}
Vincent Guittot	642dbc3	2013-04-18 18:34:26 +0200	[diff] [blame]	1566
				1567	/*
				1568	* Update the rq's load with the elapsed running time before entering
				1569	* idle. if the last scheduled task is not a CFS task, idle_enter will
				1570	* be the only way to update the runnable statistic.
				1571	*/
				1572	void idle_enter_fair(struct rq *this_rq)
				1573	{
				1574	update_rq_runnable_avg(this_rq, 1);
				1575	}
				1576
				1577	/*
				1578	* Update the rq's load with the elapsed idle time before a task is
				1579	* scheduled. if the newly scheduled task is not a CFS task, idle_exit will
				1580	* be the only way to update the runnable statistic.
				1581	*/
				1582	void idle_exit_fair(struct rq *this_rq)
				1583	{
				1584	update_rq_runnable_avg(this_rq, 0);
				1585	}
				1586
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1587	#else
/*
 * No-op stand-ins with the same signatures as the load-tracking entry
 * points, for builds in which the implementation above is not compiled in.
 */
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq)
{
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
}

static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int wakeup)
{
}

static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int sleep)
{
}

static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
					      int force_update)
{
}
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1599	#endif
				1600
/*
 * Fold the time @se spent sleeping or blocked into its schedstats.  The
 * whole body is compiled out without CONFIG_SCHEDSTATS.
 *
 * A pending statistics.sleep_start / block_start stamp is turned into a
 * delta against the rq clock (clamped at zero), folded into the matching
 * *_max and sum_sleep_runtime fields, and the stamp is cleared.  When the
 * entity is a task, the delta is additionally passed to
 * account_scheduler_latency() and the sched_stat_* tracepoints; blocked
 * time also feeds the iowait counters (if tsk->in_iowait) and the sleep
 * profiler (if enabled).
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;

		/* Guard against the clock appearing to run backwards */
		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;

		/* Guard against the clock appearing to run backwards */
		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}
				1662
/*
 * Debug aid (CONFIG_SCHED_DEBUG only): count in nr_spread_over every
 * entity whose vruntime is more than 3 * sysctl_sched_latency away from
 * the cfs_rq's min_vruntime, in either direction.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
				1675
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1676	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1677	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				1678	{
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	1679	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1680
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	1681	/*
				1682	* The 'current' period is already promised to the current tasks,
				1683	* however the extra weight of the new task will slow them down a
				1684	* little, place the new task so that it fits in the slot that
				1685	* stays open at the end.
				1686	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1687	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	1688	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1689
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1690	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	1691	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1692	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	1693
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1694	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1695	* Halve their sleep time's effect, to allow
				1696	* for a gentler effect of sleepers:
				1697	*/
				1698	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				1699	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	1700
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1701	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1702	}
				1703
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	1704	/* ensure we never gain time by being placed backwards. */
Viresh Kumar	16c8f1c	2012-11-08 13:33:46 +0530	[diff] [blame]	1705	se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1706	}
				1707
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1708	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				1709
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1710	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1711	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1712	{
				1713	/*
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1714	* Update the normalized vruntime before updating min_vruntime
				1715	* through callig update_curr().
				1716	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1717	if (!(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_WAKING))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1718	se->vruntime += cfs_rq->min_vruntime;
				1719
				1720	/*
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1721	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1722	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	1723	update_curr(cfs_rq);
Paul Turner	f269ae0	2012-10-04 13:18:31 +0200	[diff] [blame]	1724	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1725	account_entity_enqueue(cfs_rq, se);
				1726	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1727
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1728	if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1729	place_entity(cfs_rq, se, 0);
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	1730	enqueue_sleeper(cfs_rq, se);
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	1731	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1732
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	1733	update_stats_enqueue(cfs_rq, se);
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1734	check_spread(cfs_rq, se);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1735	if (se != cfs_rq->curr)
				1736	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1737	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1738
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1739	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1740	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1741	check_enqueue_throttle(cfs_rq);
				1742	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1743	}
				1744
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1745	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1746	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1747	for_each_sched_entity(se) {
				1748	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1749	if (cfs_rq->last == se)
				1750	cfs_rq->last = NULL;
				1751	else
				1752	break;
				1753	}
				1754	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1755
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1756	static void __clear_buddies_next(struct sched_entity *se)
				1757	{
				1758	for_each_sched_entity(se) {
				1759	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1760	if (cfs_rq->next == se)
				1761	cfs_rq->next = NULL;
				1762	else
				1763	break;
				1764	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1765	}
				1766
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1767	static void __clear_buddies_skip(struct sched_entity *se)
				1768	{
				1769	for_each_sched_entity(se) {
				1770	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1771	if (cfs_rq->skip == se)
				1772	cfs_rq->skip = NULL;
				1773	else
				1774	break;
				1775	}
				1776	}
				1777
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1778	static void clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
				1779	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1780	if (cfs_rq->last == se)
				1781	__clear_buddies_last(se);
				1782
				1783	if (cfs_rq->next == se)
				1784	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1785
				1786	if (cfs_rq->skip == se)
				1787	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1788	}
				1789
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	1790	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1791
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1792	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1793	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1794	{
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1795	/*
				1796	* Update run-time statistics of the 'current'.
				1797	*/
				1798	update_curr(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1799	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1800
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	1801	update_stats_dequeue(cfs_rq, se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1802	if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1803	#ifdef CONFIG_SCHEDSTATS
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1804	if (entity_is_task(se)) {
				1805	struct task_struct *tsk = task_of(se);
				1806
				1807	if (tsk->state & TASK_INTERRUPTIBLE)
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1808	se->statistics.sleep_start = rq_of(cfs_rq)->clock;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1809	if (tsk->state & TASK_UNINTERRUPTIBLE)
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1810	se->statistics.block_start = rq_of(cfs_rq)->clock;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1811	}
Dmitry Adamushko	db36cc7	2007-10-15 17:00:06 +0200	[diff] [blame]	1812	#endif
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1813	}
				1814
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1815	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	1816
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1817	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1818	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1819	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1820	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1821
				1822	/*
				1823	* Normalize the entity after updating the min_vruntime because the
				1824	* update can refer to the ->curr item and we need to reflect this
				1825	* movement in our normalized position.
				1826	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1827	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1828	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1829
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1830	/* return excess runtime on last dequeue */
				1831	return_cfs_rq_runtime(cfs_rq);
				1832
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1833	update_min_vruntime(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1834	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1835	}
				1836
				1837	/*
				1838	* Tick-time preemption check for @curr, the running entity of @cfs_rq.
					* Called from entity_tick() when more than one entity is runnable.
					*
					* A reschedule is requested when either:
					*  - @curr has run longer than its sched_slice() since
					*    prev_sum_exec_runtime was last snapshotted, or
					*  - @curr has run at least sysctl_sched_min_granularity and its
					*    vruntime leads the first queued entity by more than that slice.
				1839	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	1840	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	1841	check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1842	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1843	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1844	struct sched_entity *se;
				1845	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1846
Peter Zijlstra	6d0f0ebd	2007-10-15 17:00:05 +0200	[diff] [blame]	1847	ideal_runtime = sched_slice(cfs_rq, curr);
					/* time executed since prev_sum_exec_runtime was recorded */
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1848	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1849	if (delta_exec > ideal_runtime) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1850	resched_task(rq_of(cfs_rq)->curr);
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1851	/*
				1852	* The current task ran long enough, ensure it doesn't get
				1853	* re-elected due to buddy favours.
				1854	*/
				1855	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1856	return;
				1857	}
				1858
				1859	/*
				1860	* Ensure that a task that missed wakeup preemption by a
				1861	* narrow margin doesn't have to wait for a full slice.
				1862	* This also mitigates buddy induced latencies under load.
				1863	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1864	if (delta_exec < sysctl_sched_min_granularity)
				1865	return;
				1866
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1867	se = __pick_first_entity(cfs_rq);
					/* signed: negative when curr is still behind the first queued entity */
				1868	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1869
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1870	if (delta < 0)
				1871	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	1872
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1873	if (delta > ideal_runtime)
				1874	resched_task(rq_of(cfs_rq)->curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1875	}
				1876
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1877	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	1878	set_next_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1879	{
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1880	/* 'current' is not kept within the tree. */
				1881	if (se->on_rq) {
				1882	/*
				1883	* Any task has to be enqueued before it get to execute on
				1884	* a CPU. So account for the time it spent waiting on the
				1885	* runqueue.
				1886	*/
				1887	update_stats_wait_end(cfs_rq, se);
				1888	__dequeue_entity(cfs_rq, se);
				1889	}
				1890
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	1891	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1892	cfs_rq->curr = se;
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	1893	#ifdef CONFIG_SCHEDSTATS
				1894	/*
				1895	* Track our maximum slice length, if the CPU's load is at
				1896	* least twice that of our own weight (i.e. dont track it
				1897	* when there are only lesser-weight tasks around):
				1898	*/
Dmitry Adamushko	495eca4	2007-10-15 17:00:06 +0200	[diff] [blame]	1899	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1900	se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	1901	se->sum_exec_runtime - se->prev_sum_exec_runtime);
				1902	}
				1903	#endif
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	1904	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1905	}
				1906
Peter Zijlstra	3f3a490	2008-10-24 11:06:16 +0200	[diff] [blame]	1907	static int
				1908	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se);
				1909
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1910	/*
				1911	* Pick the next process, keeping these things in mind, in this order:
				1912	* 1) keep things fair between processes/task groups
				1913	* 2) pick the "next" process, since someone really wants that to run
				1914	* 3) pick the "last" process, for cache locality
				1915	* 4) do not run the "skip" process, if something else is available
					*
					* The checks below run in the reverse order (skip, last, next) and each
					* later one overwrites the choice, which yields the priority above.
					* A buddy is only taken when wakeup_preempt_entity(buddy, left) < 1.
					*
					* Returns the chosen entity; clear_buddies() is called on it first.
				1916	*/
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	1917	static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1918	{
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1919	struct sched_entity *se = __pick_first_entity(cfs_rq);
					/* fairness reference: every candidate is compared against this one */
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1920	struct sched_entity *left = se;
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	1921
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1922	/*
				1923	* Avoid running the skip buddy, if running something else can
				1924	* be done without getting too unfair.
				1925	*/
				1926	if (cfs_rq->skip == se) {
				1927	struct sched_entity *second = __pick_next_entity(se);
				1928	if (second && wakeup_preempt_entity(second, left) < 1)
				1929	se = second;
				1930	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1931
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1932	/*
				1933	* Prefer last buddy, try to return the CPU to a preempted task.
				1934	*/
				1935	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				1936	se = cfs_rq->last;
				1937
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1938	/*
				1939	* Someone really wants this to run. If it's not unfair, run it.
				1940	*/
				1941	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				1942	se = cfs_rq->next;
				1943
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1944	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	1945
				1946	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1947	}
				1948
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1949	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				1950
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	1951	static void put_prev_entity(struct cfs_rq cfs_rq, struct sched_entity prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1952	{
				1953	/*
				1954	* If still on the runqueue then deactivate_task()
				1955	* was not called and update_curr() has to be done:
				1956	*/
				1957	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	1958	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1959
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1960	/* throttle cfs_rqs exceeding runtime */
				1961	check_cfs_rq_runtime(cfs_rq);
				1962
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1963	check_spread(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1964	if (prev->on_rq) {
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	1965	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1966	/* Put 'current' back into the tree. */
				1967	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1968	/* in !on_rq case, update occurred at dequeue */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1969	update_entity_load_avg(prev, 1);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1970	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1971	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1972	}
				1973
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1974	static void
				1975	entity_tick(struct cfs_rq cfs_rq, struct sched_entity curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1976	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1977	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1978	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1979	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1980	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1981
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	1982	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1983	* Ensure that runnable average is periodically updated.
				1984	*/
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1985	update_entity_load_avg(curr, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1986	update_cfs_rq_blocked_load(cfs_rq, 1);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1987
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1988	#ifdef CONFIG_SCHED_HRTICK
				1989	/*
				1990	* queued ticks are scheduled to match the slice, so don't bother
				1991	* validating it and just reschedule.
				1992	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	1993	if (queued) {
				1994	resched_task(rq_of(cfs_rq)->curr);
				1995	return;
				1996	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1997	/*
				1998	* don't let the period tick interfere with the hrtick preemption
				1999	*/
				2000	if (!sched_feat(DOUBLE_TICK) &&
				2001	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				2002	return;
				2003	#endif
				2004
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	2005	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	2006	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2007	}
				2008
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2009
				2010	/**************************************************
				2011	* CFS bandwidth control machinery
				2012	*/
				2013
				2014	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2015
				2016	#ifdef HAVE_JUMP_LABEL
					/* Static key that gates the bandwidth-control checks in this file. */
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2017	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2018
					/* Reports the state of the __cfs_bandwidth_used static key. */
				2019	static inline bool cfs_bandwidth_used(void)
				2020	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2021	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2022	}
				2023
					/*
					* Adjust the static key when a group's bandwidth setting changes:
					* incremented on a !was_enabled -> enabled transition, decremented on
					* the reverse; no change when @enabled and @was_enabled agree.
					*/
				2024	void account_cfs_bandwidth_used(int enabled, int was_enabled)
				2025	{
				2026	/* only need to count groups transitioning between enabled/!enabled */
				2027	if (enabled && !was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2028	static_key_slow_inc(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2029	else if (!enabled && was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2030	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2031	}
				2032	#else /* HAVE_JUMP_LABEL */
					/* Without jump labels the bandwidth checks are always taken. */
				2033	static bool cfs_bandwidth_used(void)
				2034	{
				2035	return true;
				2036	}
				2037
					/* Nothing to account when there is no static key. */
				2038	void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
				2039	#endif /* HAVE_JUMP_LABEL */
				2040
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2041	/*
				2042	* default period for cfs group bandwidth.
				2043	* default: 0.1s, units: nanoseconds
					*
					* Returns the constant 100000000 (1e8 ns).
				2044	*/
				2045	static inline u64 default_cfs_period(void)
				2046	{
				2047	return 100000000ULL;
				2048	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2049
					/*
					* Size of one runtime slice handed from the global pool to a cfs_rq:
					* sysctl_sched_cfs_bandwidth_slice scaled by NSEC_PER_USEC (so the
					* sysctl is presumably in microseconds and the result in nanoseconds).
					* The cast widens to u64 before the multiplication.
					*/
				2050	static inline u64 sched_cfs_bandwidth_slice(void)
				2051	{
				2052	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				2053	}
				2054
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2055	/*
				2056	* Replenish runtime according to assigned quota and update expiration time.
				2057	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				2058	* additional synchronization around rq->lock.
				2059	*
					* Any runtime left over from the previous period is discarded:
					* cfs_b->runtime is overwritten with the quota, not added to.
					* Does nothing when the quota is RUNTIME_INF.
					*
				2060	* requires cfs_b->lock
				2061	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2062	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2063	{
				2064	u64 now;
				2065
				2066	if (cfs_b->quota == RUNTIME_INF)
				2067	return;
				2068
				2069	now = sched_clock_cpu(smp_processor_id());
				2070	cfs_b->runtime = cfs_b->quota;
					/* the new quota is valid for one period from now */
				2071	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				2072	}
				2073
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2074	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				2075	{
				2076	return &tg->cfs_bandwidth;
				2077	}
				2078
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2079	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
					/*
					* While throttle_count is non-zero the value recorded in
					* throttled_clock_task is returned, so this clock stands still.
					* Otherwise it is rq->clock_task minus the accumulated
					* throttled_clock_task_time.
					*/
				2080	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2081	{
				2082	if (unlikely(cfs_rq->throttle_count))
				2083	return cfs_rq->throttled_clock_task;
				2084
				2085	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
				2086	}
				2087
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2088	/* returns 0 on failure to allocate runtime */
					/*
					* Try to top @cfs_rq up from its task group's global pool.
					*
					* The request is one bandwidth slice plus whatever deficit the cfs_rq
					* has run up. With an infinite quota the full request is granted;
					* otherwise at most what is left in cfs_b->runtime. The global
					* expiry is copied to the cfs_rq if it is later than the local one.
					*
					* Returns non-zero when runtime_remaining is positive afterwards.
					*/
				2089	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2090	{
				2091	struct task_group *tg = cfs_rq->tg;
				2092	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2093	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2094
				2095	/* note: this is a positive sum as runtime_remaining <= 0 */
				2096	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				2097
				2098	raw_spin_lock(&cfs_b->lock);
				2099	if (cfs_b->quota == RUNTIME_INF)
				2100	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2101	else {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2102	/*
				2103	* If the bandwidth pool has become inactive, then at least one
				2104	* period must have elapsed since the last consumption.
				2105	* Refresh the global state and ensure bandwidth timer becomes
				2106	* active.
				2107	*/
				2108	if (!cfs_b->timer_active) {
				2109	__refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2110	__start_cfs_bandwidth(cfs_b);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2111	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2112
				2113	if (cfs_b->runtime > 0) {
					/* grant what was asked for, capped by what the pool holds */
				2114	amount = min(cfs_b->runtime, min_amount);
				2115	cfs_b->runtime -= amount;
					/* runtime was consumed, so the group is not idle this period */
				2116	cfs_b->idle = 0;
				2117	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2118	}
					/* sample the global expiry while still holding cfs_b->lock */
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2119	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2120	raw_spin_unlock(&cfs_b->lock);
				2121
				2122	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2123	/*
				2124	* we may have advanced our local expiration to account for allowed
				2125	* spread between our sched_clock and the one on which runtime was
				2126	* issued.
				2127	*/
				2128	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				2129	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2130
				2131	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2132	}
				2133
				2134	/*
					* Expire the local runtime of @cfs_rq once its deadline has passed.
					*
				2135	* Note: This depends on the synchronization provided by sched_clock and the
				2136	* fact that rq->clock snapshots this value.
				2137	*/
				2138	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2139	{
				2140	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2141	struct rq *rq = rq_of(cfs_rq);
				2142
				2143	/* if the deadline is ahead of our clock, nothing to do */
				2144	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
				2145	return;
				2146
					/* already in deficit: there is no runtime left to expire */
				2147	if (cfs_rq->runtime_remaining < 0)
				2148	return;
				2149
				2150	/*
				2151	* If the local deadline has passed we have to consider the
				2152	* possibility that our sched_clock is 'fast' and the global deadline
				2153	* has not truly expired.
				2154	*
				2155	* Fortunately we can determine whether this is the case by checking
				2156	* whether the global deadline has advanced.
				2157	*/
				2158
				2159	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
				2160	/* extend local deadline, drift is bounded above by 2 ticks */
				2161	cfs_rq->runtime_expires += TICK_NSEC;
				2162	} else {
				2163	/* global deadline is ahead, expiration has passed */
				2164	cfs_rq->runtime_remaining = 0;
				2165	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2166	}
				2167
					/*
					* Charge @delta_exec of execution against @cfs_rq's local runtime.
					*
					* After expiring stale runtime, a cfs_rq left with no positive
					* runtime asks the global pool for more; if that fails and an entity
					* is currently running, the rq's current task is marked for
					* rescheduling.
					*/
				2168	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2169	unsigned long delta_exec)
				2170	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2171	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2172	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2173	expire_cfs_rq_runtime(cfs_rq);
				2174
				2175	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2176	return;
				2177
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2178	/*
				2179	* if we're unable to extend our runtime we resched so that the active
				2180	* hierarchy can be throttled
				2181	*/
				2182	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
				2183	resched_task(rq_of(cfs_rq)->curr);
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2184	}
				2185
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2186	static __always_inline
				2187	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2188	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2189	if (!cfs_bandwidth_used() \|\| !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2190	return;
				2191
				2192	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				2193	}
				2194
					/* non-zero when bandwidth control is in use and @cfs_rq itself is throttled */
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2195	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2196	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2197	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2198	}
				2199
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2200	/* check whether cfs_rq, or any parent, is throttled */
					/* (throttle_count is maintained by tg_throttle_down()/tg_unthrottle_up()) */
				2201	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2202	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2203	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2204	}
				2205
				2206	/*
				2207	* Ensure that neither of the group entities corresponding to src_cpu or
				2208	* dest_cpu are members of a throttled hierarchy when performing group
				2209	* load-balance operations.
					*
					* Returns non-zero if @tg's cfs_rq on either @src_cpu or @dest_cpu is
					* in a throttled hierarchy.
				2210	*/
				2211	static inline int throttled_lb_pair(struct task_group *tg,
				2212	int src_cpu, int dest_cpu)
				2213	{
				2214	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
				2215
				2216	src_cfs_rq = tg->cfs_rq[src_cpu];
				2217	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				2218
				2219	return throttled_hierarchy(src_cfs_rq) ||
				2220	throttled_hierarchy(dest_cfs_rq);
				2221	}
				2222
				2223	/* updated child weight may affect parent so we have to do this bottom up */
					/*
					* Tree-walk callback (see unthrottle_cfs_rq()): @data is the struct rq
					* being unthrottled. Drops one throttle reference on @tg's cfs_rq for
					* that rq's CPU. Always returns 0.
					*/
				2224	static int tg_unthrottle_up(struct task_group *tg, void *data)
				2225	{
				2226	struct rq *rq = data;
				2227	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2228
				2229	cfs_rq->throttle_count--;
				2230	#ifdef CONFIG_SMP
				2231	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2232	/* adjust cfs_rq_clock_task() */
					/* add the time elapsed since tg_throttle_down() stopped the clock */
				2233	cfs_rq->throttled_clock_task_time += rq->clock_task -
				2234	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2235	}
				2236	#endif
				2237
				2238	return 0;
				2239	}
				2240
				2241	static int tg_throttle_down(struct task_group tg, void data)
				2242	{
				2243	struct rq *rq = data;
				2244	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2245
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2246	/* group is entering throttled state, stop time */
				2247	if (!cfs_rq->throttle_count)
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2248	cfs_rq->throttled_clock_task = rq->clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2249	cfs_rq->throttle_count++;
				2250
				2251	return 0;
				2252	}
				2253
					/*
					* Throttle @cfs_rq.
					*
					* Marks the task-group subtree as throttled on this rq, dequeues the
					* group's entity up the hierarchy while removing its h_nr_running
					* from each level, then flags the cfs_rq as throttled and queues it
					* on cfs_b->throttled_cfs_rq (under cfs_b->lock).
					*/
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2254	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2255	{
				2256	struct rq *rq = rq_of(cfs_rq);
				2257	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2258	struct sched_entity *se;
				2259	long task_delta, dequeue = 1;
				2260
					/* the entity that represents this group on this rq's CPU */
				2261	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2262
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2263	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2264	rcu_read_lock();
				2265	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				2266	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2267
				2268	task_delta = cfs_rq->h_nr_running;
				2269	for_each_sched_entity(se) {
				2270	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				2271	/* throttled entity or throttle-on-deactivate */
				2272	if (!se->on_rq)
				2273	break;
				2274
				2275	if (dequeue)
				2276	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				2277	qcfs_rq->h_nr_running -= task_delta;
				2278
					/*
					* Once a level still carries load, stop dequeuing further
					* up; h_nr_running is still adjusted on the remaining levels.
					*/
				2279	if (qcfs_rq->load.weight)
				2280	dequeue = 0;
				2281	}
				2282
					/* presumably se is NULL only when the walk ran to the top without break */
				2283	if (!se)
				2284	rq->nr_running -= task_delta;
				2285
				2286	cfs_rq->throttled = 1;
					/* start of the throttled interval; consumed by unthrottle_cfs_rq() */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2287	cfs_rq->throttled_clock = rq->clock;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2288	raw_spin_lock(&cfs_b->lock);
				2289	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
				2290	raw_spin_unlock(&cfs_b->lock);
				2291	}
				2292
					/*
					* Undo throttle_cfs_rq() for @cfs_rq.
					*
					* Clears the throttled flag, accounts the throttled interval in
					* cfs_b->throttled_time, removes the cfs_rq from the throttled list,
					* releases the subtree's throttle references and, if the cfs_rq has
					* load, re-enqueues the group's entity up the hierarchy.
					*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2293	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2294	{
				2295	struct rq *rq = rq_of(cfs_rq);
				2296	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2297	struct sched_entity *se;
				2298	int enqueue = 1;
				2299	long task_delta;
				2300
				2301	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2302
				2303	cfs_rq->throttled = 0;
				2304	raw_spin_lock(&cfs_b->lock);
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2305	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2306	list_del_rcu(&cfs_rq->throttled_list);
				2307	raw_spin_unlock(&cfs_b->lock);
				2308
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2309	update_rq_clock(rq);
				2310	/* update hierarchical throttle state */
				2311	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				2312
					/* nothing is queued on this cfs_rq, so there is nothing to re-enqueue */
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2313	if (!cfs_rq->load.weight)
				2314	return;
				2315
				2316	task_delta = cfs_rq->h_nr_running;
				2317	for_each_sched_entity(se) {
					/* an ancestor that is already queued ends the enqueue phase */
				2318	if (se->on_rq)
				2319	enqueue = 0;
				2320
				2321	cfs_rq = cfs_rq_of(se);
				2322	if (enqueue)
				2323	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				2324	cfs_rq->h_nr_running += task_delta;
				2325
					/* a throttled ancestor hides everything below it from the rq */
				2326	if (cfs_rq_throttled(cfs_rq))
				2327	break;
				2328	}
				2329
					/* presumably se is NULL only when the walk ran to the top without break */
				2330	if (!se)
				2331	rq->nr_running += task_delta;
				2332
				2333	/* determine whether we need to wake up potentially idle cpu */
				2334	if (rq->curr == rq->idle && rq->cfs.nr_running)
				2335	resched_task(rq->curr);
				2336	}
				2337
					/*
					* Hand out up to @remaining runtime to the throttled cfs_rqs of @cfs_b.
					*
					* Each throttled cfs_rq receives just enough to bring its
					* runtime_remaining to 1 (or whatever is left, if less), has its
					* expiry set to @expires, and is unthrottled if its runtime became
					* positive. Each rq->lock is taken in turn; cfs_b->lock is taken
					* inside unthrottle_cfs_rq().
					*
					* Returns the runtime that was not handed out.
					*/
				2338	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				2339	u64 remaining, u64 expires)
				2340	{
				2341	struct cfs_rq *cfs_rq;
				2342	u64 runtime = remaining;
				2343
				2344	rcu_read_lock();
				2345	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				2346	throttled_list) {
				2347	struct rq *rq = rq_of(cfs_rq);
				2348
				2349	raw_spin_lock(&rq->lock);
					/* re-check now that rq->lock is held */
				2350	if (!cfs_rq_throttled(cfs_rq))
				2351	goto next;
				2352
					/* deficit plus one, so the result is strictly positive */
				2353	runtime = -cfs_rq->runtime_remaining + 1;
				2354	if (runtime > remaining)
				2355	runtime = remaining;
				2356	remaining -= runtime;
				2357
				2358	cfs_rq->runtime_remaining += runtime;
				2359	cfs_rq->runtime_expires = expires;
				2360
				2361	/* we check whether we're throttled above */
				2362	if (cfs_rq->runtime_remaining > 0)
				2363	unthrottle_cfs_rq(cfs_rq);
				2364
				2365	next:
				2366	raw_spin_unlock(&rq->lock);
				2367
				2368	if (!remaining)
				2369	break;
				2370	}
				2371	rcu_read_unlock();
				2372
				2373	return remaining;
				2374	}
				2375
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2376	/*
				2377	* Responsible for refilling a task_group's bandwidth and unthrottling its
				2378	* cfs_rqs as appropriate. If there has been no activity within the last
				2379	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				2380	* used to track this state.
					*
					* @overrun is added to cfs_b->nr_periods, and to cfs_b->nr_throttled
					* when throttled cfs_rqs exist.
					*
					* Returns non-zero when the timer is being deactivated (in which case
					* cfs_b->timer_active has been cleared), 0 otherwise.
				2381	*/
				2382	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				2383	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2384	u64 runtime, runtime_expires;
				2385	int idle = 1, throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2386
				2387	raw_spin_lock(&cfs_b->lock);
				2388	/* no need to continue the timer with no bandwidth constraint */
				2389	if (cfs_b->quota == RUNTIME_INF)
				2390	goto out_unlock;
				2391
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2392	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2393	/* idle depends on !throttled (for the case of a large deficit) */
				2394	idle = cfs_b->idle && !throttled;
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2395	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2396
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2397	/* if we're going inactive then everything else can be deferred */
				2398	if (idle)
				2399	goto out_unlock;
				2400
				2401	__refill_cfs_bandwidth_runtime(cfs_b);
				2402
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2403	if (!throttled) {
				2404	/* mark as potentially idle for the upcoming period */
				2405	cfs_b->idle = 1;
				2406	goto out_unlock;
				2407	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2408
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2409	/* account preceding periods in which throttling occurred */
				2410	cfs_b->nr_throttled += overrun;
				2411
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2412	/*
				2413	* There are throttled entities so we must first use the new bandwidth
				2414	* to unthrottle them before making it generally available. This
				2415	* ensures that all existing debts will be paid before a new cfs_rq is
				2416	* allowed to run.
				2417	*/
				2418	runtime = cfs_b->runtime;
				2419	runtime_expires = cfs_b->runtime_expires;
					/* hold the whole pool locally while it is being distributed */
				2420	cfs_b->runtime = 0;
				2421
				2422	/*
				2423	* This check is repeated as we are holding onto the new bandwidth
				2424	* while we unthrottle. This can potentially race with an unthrottled
				2425	* group trying to acquire new bandwidth from the global pool.
				2426	*/
				2427	while (throttled && runtime > 0) {
				2428	raw_spin_unlock(&cfs_b->lock);
				2429	/* we can't nest cfs_b->lock while distributing bandwidth */
				2430	runtime = distribute_cfs_runtime(cfs_b, runtime,
				2431	runtime_expires);
				2432	raw_spin_lock(&cfs_b->lock);
				2433
				2434	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2435	}
				2436
				2437	/* return (any) remaining runtime */
				2438	cfs_b->runtime = runtime;
				2439	/*
				2440	* While we are ensured activity in the period following an
				2441	* unthrottle, this also covers the case in which the new bandwidth is
				2442	* insufficient to cover the existing bandwidth deficit. (Forcing the
				2443	* timer to remain active while there are any throttled entities.)
				2444	*/
				2445	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2446	out_unlock:
				2447	if (idle)
				2448	cfs_b->timer_active = 0;
				2449	raw_spin_unlock(&cfs_b->lock);
				2450
				2451	return idle;
				2452	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2453
					/* Slack-return tunables; each is a multiple of NSEC_PER_MSEC. */
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2454	/* a cfs_rq won't donate quota below this amount */
				2455	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				2456	/* minimum remaining period time to redistribute slack quota */
				2457	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				2458	/* how long we wait to gather additional slack before distributing */
				2459	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				2460
				2461	/* are we near the end of the current quota period? */
					/*
					* Returns 1 when the period timer's callback is running or the timer
					* expires in less than @min_expire ns, 0 otherwise.
					*
					* The remaining time is kept signed: an expiry that already lies in the
					* past yields a negative value, which must count as "refresh imminent"
					* rather than wrapping to a huge unsigned number.
					*/
				2462	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				2463	{
				2464	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				2465	s64 remaining;
				2466
				2467	/* if the call-back is running a quota refresh is already occurring */
				2468	if (hrtimer_callback_running(refresh_timer))
				2469	return 1;
				2470
				2471	/* is a quota refresh about to occur? */
				2472	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				2473	if (remaining < (s64)min_expire)
				2474	return 1;
				2475
				2476	return 0;
				2477	}
				2478
					/*
					* Arm cfs_b->slack_timer to fire after cfs_bandwidth_slack_period,
					* unless a quota refresh would occur within the slack period plus
					* min_bandwidth_expiration.
					*/
				2479	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				2480	{
				2481	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				2482
				2483	/* if there's a quota refresh soon don't bother with slack */
				2484	if (runtime_refresh_within(cfs_b, min_left))
				2485	return;
				2486
				2487	start_bandwidth_timer(&cfs_b->slack_timer,
				2488	ns_to_ktime(cfs_bandwidth_slack_period));
				2489	}
				2490
				2491	/* we know any runtime found here is valid as update_curr() precedes return */
					/*
					* Give back to the global pool whatever @cfs_rq holds beyond
					* min_cfs_rq_runtime. The pool is only credited when the quota is
					* finite and the local and global expiry match; the local runtime is
					* reduced by the slack in either case.
					*/
				2492	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2493	{
				2494	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2495	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				2496
				2497	if (slack_runtime <= 0)
				2498	return;
				2499
				2500	raw_spin_lock(&cfs_b->lock);
				2501	if (cfs_b->quota != RUNTIME_INF &&
				2502	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				2503	cfs_b->runtime += slack_runtime;
				2504
				2505	/* we are under rq->lock, defer unthrottling using a timer */
				2506	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				2507	!list_empty(&cfs_b->throttled_cfs_rq))
				2508	start_cfs_slack_bandwidth(cfs_b);
				2509	}
				2510	raw_spin_unlock(&cfs_b->lock);
				2511
				2512	/* even if it's not valid for return we don't want to try again */
				2513	cfs_rq->runtime_remaining -= slack_runtime;
				2514	}
				2515
				2516	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2517	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2518	if (!cfs_bandwidth_used())
				2519	return;
				2520
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	2521	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2522	return;
				2523
				2524	__return_cfs_rq_runtime(cfs_rq);
				2525	}
				2526
				2527	/*
				2528	* This is done with a timer (instead of inline with bandwidth return) since
				2529	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				2530	*/
				2531	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				2532	{
				2533	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				2534	u64 expires;
				2535
				2536	/* confirm we're still not at a refresh boundary */
				2537	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
				2538	return;
				2539
				2540	raw_spin_lock(&cfs_b->lock);
				2541	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
				2542	runtime = cfs_b->runtime;
				2543	cfs_b->runtime = 0;
				2544	}
				2545	expires = cfs_b->runtime_expires;
				2546	raw_spin_unlock(&cfs_b->lock);
				2547
				2548	if (!runtime)
				2549	return;
				2550
				2551	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				2552
				2553	raw_spin_lock(&cfs_b->lock);
				2554	if (expires == cfs_b->runtime_expires)
				2555	cfs_b->runtime = runtime;
				2556	raw_spin_unlock(&cfs_b->lock);
				2557	}
				2558
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2559	/*
				2560	* When a group wakes up we want to make sure that its quota is not already
				2561	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				2562	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				2563	*/
				2564	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				2565	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2566	if (!cfs_bandwidth_used())
				2567	return;
				2568
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2569	/* an active group must be handled by the update_curr()->put() path */
				2570	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				2571	return;
				2572
				2573	/* ensure the group is not already throttled */
				2574	if (cfs_rq_throttled(cfs_rq))
				2575	return;
				2576
				2577	/* update runtime allocation */
				2578	account_cfs_rq_runtime(cfs_rq, 0);
				2579	if (cfs_rq->runtime_remaining <= 0)
				2580	throttle_cfs_rq(cfs_rq);
				2581	}
				2582
				2583	/* conditionally throttle active cfs_rq's from put_prev_entity() */
				2584	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2585	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2586	if (!cfs_bandwidth_used())
				2587	return;
				2588
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2589	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
				2590	return;
				2591
				2592	/*
				2593	* it's possible for a throttled entity to be forced into a running
				2594	* state (e.g. set_curr_task), in this case we're finished.
				2595	*/
				2596	if (cfs_rq_throttled(cfs_rq))
				2597	return;
				2598
				2599	throttle_cfs_rq(cfs_rq);
				2600	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2601
				2602	static inline u64 default_cfs_period(void);
				2603	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
				2604	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
				2605
				2606	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				2607	{
				2608	struct cfs_bandwidth *cfs_b =
				2609	container_of(timer, struct cfs_bandwidth, slack_timer);
				2610	do_sched_cfs_slack_timer(cfs_b);
				2611
				2612	return HRTIMER_NORESTART;
				2613	}
				2614
				2615	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				2616	{
				2617	struct cfs_bandwidth *cfs_b =
				2618	container_of(timer, struct cfs_bandwidth, period_timer);
				2619	ktime_t now;
				2620	int overrun;
				2621	int idle = 0;
				2622
				2623	for (;;) {
				2624	now = hrtimer_cb_get_time(timer);
				2625	overrun = hrtimer_forward(timer, now, cfs_b->period);
				2626
				2627	if (!overrun)
				2628	break;
				2629
				2630	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				2631	}
				2632
				2633	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				2634	}
				2635
				2636	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2637	{
				2638	raw_spin_lock_init(&cfs_b->lock);
				2639	cfs_b->runtime = 0;
				2640	cfs_b->quota = RUNTIME_INF;
				2641	cfs_b->period = ns_to_ktime(default_cfs_period());
				2642
				2643	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
				2644	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2645	cfs_b->period_timer.function = sched_cfs_period_timer;
				2646	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2647	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				2648	}
				2649
				2650	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2651	{
				2652	cfs_rq->runtime_enabled = 0;
				2653	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				2654	}
				2655
				2656	/* requires cfs_b->lock, may release to reprogram timer */
				2657	void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2658	{
				2659	/*
				2660	* The timer may be active because we're trying to set a new bandwidth
				2661	* period or because we're racing with the tear-down path
				2662	* (timer_active==0 becomes visible before the hrtimer call-back
				2663	* terminates). In either case we ensure that it's re-programmed
				2664	*/
				2665	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
				2666	raw_spin_unlock(&cfs_b->lock);
				2667	/* ensure cfs_b->lock is available while we wait */
				2668	hrtimer_cancel(&cfs_b->period_timer);
				2669
				2670	raw_spin_lock(&cfs_b->lock);
				2671	/* if someone else restarted the timer then we're done */
				2672	if (cfs_b->timer_active)
				2673	return;
				2674	}
				2675
				2676	cfs_b->timer_active = 1;
				2677	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
				2678	}
				2679
				2680	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2681	{
				2682	hrtimer_cancel(&cfs_b->period_timer);
				2683	hrtimer_cancel(&cfs_b->slack_timer);
				2684	}
				2685
Arnd Bergmann	38dc334	2013-01-25 14:14:22 +0000	[diff] [blame]	2686	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2687	{
				2688	struct cfs_rq *cfs_rq;
				2689
				2690	for_each_leaf_cfs_rq(rq, cfs_rq) {
				2691	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2692
				2693	if (!cfs_rq->runtime_enabled)
				2694	continue;
				2695
				2696	/*
				2697	* clock_task is not advancing so we just need to make sure
				2698	* there's some valid quota amount
				2699	*/
				2700	cfs_rq->runtime_remaining = cfs_b->quota;
				2701	if (cfs_rq_throttled(cfs_rq))
				2702	unthrottle_cfs_rq(cfs_rq);
				2703	}
				2704	}
				2705
				2706	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2707	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2708	{
				2709	return rq_of(cfs_rq)->clock_task;
				2710	}
				2711
				2712	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2713	unsigned long delta_exec) {}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2714	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				2715	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2716	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2717
				2718	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2719	{
				2720	return 0;
				2721	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2722
				2723	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2724	{
				2725	return 0;
				2726	}
				2727
				2728	static inline int throttled_lb_pair(struct task_group *tg,
				2729	int src_cpu, int dest_cpu)
				2730	{
				2731	return 0;
				2732	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2733
				2734	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				2735
				2736	#ifdef CONFIG_FAIR_GROUP_SCHED
				2737	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2738	#endif
				2739
/*
 * Stub for !CONFIG_CFS_BANDWIDTH: a task group has no bandwidth structure,
 * so callers always get NULL.
 */
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
{
	return NULL;
}
				2744	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	2745	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2746
				2747	#endif /* CONFIG_CFS_BANDWIDTH */
				2748
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2749	/**************************************************
				2750	* CFS operations on tasks:
				2751	*/
				2752
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2753	#ifdef CONFIG_SCHED_HRTICK
				2754	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				2755	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2756	struct sched_entity *se = &p->se;
				2757	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2758
				2759	WARN_ON(task_rq(p) != rq);
				2760
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2761	if (cfs_rq->nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2762	u64 slice = sched_slice(cfs_rq, se);
				2763	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				2764	s64 delta = slice - ran;
				2765
				2766	if (delta < 0) {
				2767	if (rq->curr == p)
				2768	resched_task(p);
				2769	return;
				2770	}
				2771
				2772	/*
				2773	* Don't schedule slices shorter than 10000ns, that just
				2774	* doesn't make sense. Rely on vruntime for fairness.
				2775	*/
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2776	if (rq->curr != p)
Peter Zijlstra	157124c	2008-07-28 11:53:11 +0200	[diff] [blame]	2777	delta = max_t(s64, 10000LL, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2778
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2779	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2780	}
				2781	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2782
				2783	/*
				2784	* called from enqueue/dequeue and updates the hrtick when the
				2785	* current task is from our class and nr_running is low enough
				2786	* to matter.
				2787	*/
				2788	static void hrtick_update(struct rq *rq)
				2789	{
				2790	struct task_struct *curr = rq->curr;
				2791
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2792	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2793	return;
				2794
				2795	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				2796	hrtick_start_fair(rq, curr);
				2797	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	2798	#else /* !CONFIG_SCHED_HRTICK */
/* Stub for !CONFIG_SCHED_HRTICK: there is no hrtick to program. */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2803
				2804	static inline void hrtick_update(struct rq *rq)
				2805	{
				2806	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2807	#endif
				2808
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2809	/*
				2810	* The enqueue_task method is called before nr_running is
				2811	* increased. Here we update the fair scheduling stats and
				2812	* then put the task into the rbtree:
				2813	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	2814	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2815	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2816	{
				2817	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2818	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2819
				2820	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2821	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2822	break;
				2823	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2824	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2825
				2826	/*
				2827	* end evaluation on encountering a throttled cfs_rq
				2828	*
				2829	* note: in the case of encountering a throttled cfs_rq we will
				2830	* post the final h_nr_running increment below.
				2831	*/
				2832	if (cfs_rq_throttled(cfs_rq))
				2833	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2834	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2835
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2836	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2837	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2838
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2839	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	2840	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2841	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2842
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2843	if (cfs_rq_throttled(cfs_rq))
				2844	break;
				2845
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2846	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2847	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2848	}
				2849
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2850	if (!se) {
				2851	update_rq_runnable_avg(rq, rq->nr_running);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2852	inc_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2853	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2854	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2855	}
				2856
/* forward declaration; used by dequeue_task_fair() */
static void set_next_buddy(struct sched_entity *se);
				2858
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2859	/*
				2860	* The dequeue_task method is called before nr_running is
				2861	* decreased. We remove the task from the rbtree and
				2862	* update the fair scheduling stats:
				2863	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2864	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2865	{
				2866	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2867	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2868	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2869
				2870	for_each_sched_entity(se) {
				2871	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2872	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2873
				2874	/*
				2875	* end evaluation on encountering a throttled cfs_rq
				2876	*
				2877	* note: in the case of encountering a throttled cfs_rq we will
				2878	* post the final h_nr_running decrement below.
				2879	*/
				2880	if (cfs_rq_throttled(cfs_rq))
				2881	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2882	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2883
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2884	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2885	if (cfs_rq->load.weight) {
				2886	/*
				2887	* Bias pick_next to pick a task from this cfs_rq, as
				2888	* p is sleeping when it is within its sched_slice.
				2889	*/
				2890	if (task_sleep && parent_entity(se))
				2891	set_next_buddy(parent_entity(se));
Paul Turner	9598c82	2011-07-06 22:30:37 -0700	[diff] [blame]	2892
				2893	/* avoid re-evaluating load for this entity */
				2894	se = parent_entity(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2895	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2896	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2897	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2898	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2899
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2900	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	2901	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2902	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2903
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2904	if (cfs_rq_throttled(cfs_rq))
				2905	break;
				2906
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2907	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2908	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2909	}
				2910
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2911	if (!se) {
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2912	dec_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2913	update_rq_runnable_avg(rq, 1);
				2914	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2915	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2916	}
				2917
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	2918	#ifdef CONFIG_SMP
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2919	/* Used instead of source_load when we know the type == 0 */
				2920	static unsigned long weighted_cpuload(const int cpu)
				2921	{
				2922	return cpu_rq(cpu)->load.weight;
				2923	}
				2924
				2925	/*
				2926	* Return a low guess at the load of a migration-source cpu weighted
				2927	* according to the scheduling class and "nice" value.
				2928	*
				2929	* We want to under-estimate the load of migration sources, to
				2930	* balance conservatively.
				2931	*/
				2932	static unsigned long source_load(int cpu, int type)
				2933	{
				2934	struct rq *rq = cpu_rq(cpu);
				2935	unsigned long total = weighted_cpuload(cpu);
				2936
				2937	if (type == 0 \|\| !sched_feat(LB_BIAS))
				2938	return total;
				2939
				2940	return min(rq->cpu_load[type-1], total);
				2941	}
				2942
				2943	/*
				2944	* Return a high guess at the load of a migration-target cpu weighted
				2945	* according to the scheduling class and "nice" value.
				2946	*/
				2947	static unsigned long target_load(int cpu, int type)
				2948	{
				2949	struct rq *rq = cpu_rq(cpu);
				2950	unsigned long total = weighted_cpuload(cpu);
				2951
				2952	if (type == 0 \|\| !sched_feat(LB_BIAS))
				2953	return total;
				2954
				2955	return max(rq->cpu_load[type-1], total);
				2956	}
				2957
				2958	static unsigned long power_of(int cpu)
				2959	{
				2960	return cpu_rq(cpu)->cpu_power;
				2961	}
				2962
				2963	static unsigned long cpu_avg_load_per_task(int cpu)
				2964	{
				2965	struct rq *rq = cpu_rq(cpu);
				2966	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
				2967
				2968	if (nr_running)
				2969	return rq->load.weight / nr_running;
				2970
				2971	return 0;
				2972	}
				2973
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	2974
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	2975	static void task_waking_fair(struct task_struct *p)
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2976	{
				2977	struct sched_entity *se = &p->se;
				2978	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2979	u64 min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2980
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2981	#ifndef CONFIG_64BIT
				2982	u64 min_vruntime_copy;
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	2983
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2984	do {
				2985	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				2986	smp_rmb();
				2987	min_vruntime = cfs_rq->min_vruntime;
				2988	} while (min_vruntime != min_vruntime_copy);
				2989	#else
				2990	min_vruntime = cfs_rq->min_vruntime;
				2991	#endif
				2992
				2993	se->vruntime -= min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2994	}
				2995
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	2996	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	2997	/*
				2998	* effective_load() calculates the load change as seen from the root_task_group
				2999	*
				3000	* Adding load to a group doesn't make a group heavier, but can cause movement
				3001	* of group shares between cpus. Assuming the shares were perfectly aligned one
				3002	* can calculate the shift in shares.
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3003	*
				3004	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				3005	* on this @cpu and results in a total addition (subtraction) of @wg to the
				3006	* total group weight.
				3007	*
				3008	* Given a runqueue weight distribution (rw_i) we can compute a shares
				3009	* distribution (s_i) using:
				3010	*
				3011	* s_i = rw_i / \Sum rw_j (1)
				3012	*
				3013	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				3014	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				3015	* shares distribution (s_i):
				3016	*
				3017	* rw_i = { 2, 4, 1, 0 }
				3018	* s_i = { 2/7, 4/7, 1/7, 0 }
				3019	*
				3020	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				3021	* task used to run on and the CPU the waker is running on), we need to
				3022	* compute the effect of waking a task on either CPU and, in case of a sync
				3023	* wakeup, compute the effect of the current task going to sleep.
				3024	*
				3025	* So for a change of @wl to the local @cpu with an overall group weight change
				3026	* of @wg we can compute the new shares distribution (s'_i) using:
				3027	*
				3028	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				3029	*
				3030	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				3031	* differences in waking a task to CPU 0. The additional task changes the
				3032	* weight and shares distributions like:
				3033	*
				3034	* rw'_i = { 3, 4, 1, 0 }
				3035	* s'_i = { 3/8, 4/8, 1/8, 0 }
				3036	*
				3037	* We can then compute the difference in effective weight by using:
				3038	*
				3039	* dw_i = S * (s'_i - s_i) (3)
				3040	*
				3041	* Where 'S' is the group weight as seen by its parent.
				3042	*
				3043	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				3044	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				3045	* 4/7) times the weight of the group.
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3046	*/
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3047	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3048	{
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3049	struct sched_entity *se = tg->se[cpu];
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3050
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3051	if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3052	return wl;
				3053
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3054	for_each_sched_entity(se) {
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3055	long w, W;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3056
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3057	tg = se->my_q->tg;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3058
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3059	/*
				3060	* W = @wg + \Sum rw_j
				3061	*/
				3062	W = wg + calc_tg_weight(tg, se->my_q);
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3063
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3064	/*
				3065	* w = rw_i + @wl
				3066	*/
				3067	w = se->my_q->load.weight + wl;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3068
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3069	/*
				3070	* wl = S * s'_i; see (2)
				3071	*/
				3072	if (W > 0 && w < W)
				3073	wl = (w * tg->shares) / W;
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3074	else
				3075	wl = tg->shares;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3076
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3077	/*
				3078	* Per the above, wl is the new se->load.weight value; since
				3079	* those are clipped to [MIN_SHARES, ...) do so now. See
				3080	* calc_cfs_shares().
				3081	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3082	if (wl < MIN_SHARES)
				3083	wl = MIN_SHARES;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3084
				3085	/*
				3086	* wl = dw_i = S * (s'_i - s_i); see (3)
				3087	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3088	wl -= se->load.weight;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3089
				3090	/*
				3091	* Recursively apply this logic to all parent groups to compute
				3092	* the final effective load change on the root group. Since
				3093	* only the @tg group gets extra weight, all parent groups can
				3094	* only redistribute existing shares. @wl is the shift in shares
				3095	* resulting from this level per the above.
				3096	*/
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3097	wg = 0;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3098	}
				3099
				3100	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3101	}
				3102	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3103
/*
 * Stub for !CONFIG_FAIR_GROUP_SCHED: without group scheduling the load
 * change seen at the root is just @wl.
 */
static inline unsigned long effective_load(struct task_group *tg, int cpu,
					   unsigned long wl, unsigned long wg)
{
	return wl;
}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3109
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3110	#endif
				3111
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3112	static int wake_affine(struct sched_domain sd, struct task_struct p, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3113	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3114	s64 this_load, load;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3115	int idx, this_cpu, prev_cpu;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3116	unsigned long tl_per_task;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3117	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3118	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3119	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3120
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3121	idx = sd->wake_idx;
				3122	this_cpu = smp_processor_id();
				3123	prev_cpu = task_cpu(p);
				3124	load = source_load(prev_cpu, idx);
				3125	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3126
				3127	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3128	* If sync wakeup then subtract the (maximum possible)
				3129	* effect of the currently running task from the load
				3130	* of the current CPU:
				3131	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3132	if (sync) {
				3133	tg = task_group(current);
				3134	weight = current->se.load.weight;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3135
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3136	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3137	load += effective_load(tg, prev_cpu, 0, -weight);
				3138	}
				3139
				3140	tg = task_group(p);
				3141	weight = p->se.load.weight;
				3142
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3143	/*
				3144	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3145	* due to the sync cause above having dropped this_load to 0, we'll
				3146	* always have an imbalance, but there's really nothing you can do
				3147	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3148	*
				3149	* Otherwise check if either cpus are near enough in load to allow this
				3150	* task to be woken on this_cpu.
				3151	*/
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3152	if (this_load > 0) {
				3153	s64 this_eff_load, prev_eff_load;
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	3154
				3155	this_eff_load = 100;
				3156	this_eff_load *= power_of(prev_cpu);
				3157	this_eff_load *= this_load +
				3158	effective_load(tg, this_cpu, weight, weight);
				3159
				3160	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				3161	prev_eff_load *= power_of(this_cpu);
				3162	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
				3163
				3164	balanced = this_eff_load <= prev_eff_load;
				3165	} else
				3166	balanced = true;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3167
				3168	/*
				3169	* If the currently running task will sleep within
				3170	* a reasonable amount of time then attract this newly
				3171	* woken task:
				3172	*/
Peter Zijlstra	2fb7635	2008-10-08 09:16:04 +0200	[diff] [blame]	3173	if (sync && balanced)
				3174	return 1;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3175
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3176	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3177	tl_per_task = cpu_avg_load_per_task(this_cpu);
				3178
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3179	if (balanced \|\|
				3180	(this_load <= load &&
				3181	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3182	/*
				3183	* This domain has SD_WAKE_AFFINE and
				3184	* p is cache cold in this domain, and
				3185	* there is no bad imbalance.
				3186	*/
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3187	schedstat_inc(sd, ttwu_move_affine);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3188	schedstat_inc(p, se.statistics.nr_wakeups_affine);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3189
				3190	return 1;
				3191	}
				3192	return 0;
				3193	}
				3194
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3195	/*
				3196	* find_idlest_group finds and returns the least busy CPU group within the
				3197	* domain.
				3198	*/
				3199	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	3200	find_idlest_group(struct sched_domain sd, struct task_struct p,
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3201	int this_cpu, int load_idx)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3202	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	3203	struct sched_group idlest = NULL, group = sd->groups;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3204	unsigned long min_load = ULONG_MAX, this_load = 0;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3205	int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3206
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3207	do {
				3208	unsigned long load, avg_load;
				3209	int local_group;
				3210	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3211
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3212	/* Skip over this group if it has no CPUs allowed */
				3213	if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3214	tsk_cpus_allowed(p)))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3215	continue;
				3216
				3217	local_group = cpumask_test_cpu(this_cpu,
				3218	sched_group_cpus(group));
				3219
				3220	/* Tally up the load of all CPUs in the group */
				3221	avg_load = 0;
				3222
				3223	for_each_cpu(i, sched_group_cpus(group)) {
				3224	/* Bias balancing toward cpus of our domain */
				3225	if (local_group)
				3226	load = source_load(i, load_idx);
				3227	else
				3228	load = target_load(i, load_idx);
				3229
				3230	avg_load += load;
				3231	}
				3232
				3233	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	3234	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3235
				3236	if (local_group) {
				3237	this_load = avg_load;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3238	} else if (avg_load < min_load) {
				3239	min_load = avg_load;
				3240	idlest = group;
				3241	}
				3242	} while (group = group->next, group != sd->groups);
				3243
				3244	if (!idlest \|\| 100this_load < imbalancemin_load)
				3245	return NULL;
				3246	return idlest;
				3247	}
				3248
				3249	/*
				3250	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				3251	*/
				3252	static int
				3253	find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				3254	{
				3255	unsigned long load, min_load = ULONG_MAX;
				3256	int idlest = -1;
				3257	int i;
				3258
				3259	/* Traverse only the allowed CPUs */
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3260	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3261	load = weighted_cpuload(i);
				3262
				3263	if (load < min_load \|\| (load == min_load && i == this_cpu)) {
				3264	min_load = load;
				3265	idlest = i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3266	}
				3267	}
				3268
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3269	return idlest;
				3270	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3271
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3272	/*
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3273	* Try and locate an idle CPU in the sched_domain.
				3274	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3275	static int select_idle_sibling(struct task_struct *p, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3276	{
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3277	struct sched_domain *sd;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3278	struct sched_group *sg;
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3279	int i = task_cpu(p);
				3280
				3281	if (idle_cpu(target))
				3282	return target;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3283
				3284	/*
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3285	* If the prevous cpu is cache affine and idle, don't be stupid.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3286	*/
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3287	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
				3288	return i;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3289
				3290	/*
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3291	* Otherwise, iterate the domains and find an elegible idle cpu.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3292	*/
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	3293	sd = rcu_dereference(per_cpu(sd_llc, target));
Suresh Siddha	77e8136	2011-11-17 11:08:23 -0800	[diff] [blame]	3294	for_each_lower_domain(sd) {
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3295	sg = sd->groups;
				3296	do {
				3297	if (!cpumask_intersects(sched_group_cpus(sg),
				3298	tsk_cpus_allowed(p)))
				3299	goto next;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	3300
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3301	for_each_cpu(i, sched_group_cpus(sg)) {
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3302	if (i == target \|\| !idle_cpu(i))
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3303	goto next;
				3304	}
				3305
				3306	target = cpumask_first_and(sched_group_cpus(sg),
				3307	tsk_cpus_allowed(p));
				3308	goto done;
				3309	next:
				3310	sg = sg->next;
				3311	} while (sg != sd->groups);
				3312	}
				3313	done:
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3314	return target;
				3315	}
				3316
				3317	/*
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3318	* sched_balance_self: balance the current task (running on cpu) in domains
				3319	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
				3320	* SD_BALANCE_EXEC.
				3321	*
				3322	* Balance, ie. select the least loaded group.
				3323	*
				3324	* Returns the target CPU number, or the same CPU if no balancing is needed.
				3325	*
				3326	* preempt must be disabled.
				3327	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	3328	static int
Peter Zijlstra	7608dec	2011-04-05 17:23:46 +0200	[diff] [blame]	3329	select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3330	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3331	struct sched_domain tmp, affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3332	int cpu = smp_processor_id();
				3333	int prev_cpu = task_cpu(p);
				3334	int new_cpu = cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3335	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3336	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3337
Peter Zijlstra	29baa74	2012-04-23 12:11:21 +0200	[diff] [blame]	3338	if (p->nr_cpus_allowed == 1)
Mike Galbraith	76854c7	2011-11-22 15:18:24 +0100	[diff] [blame]	3339	return prev_cpu;
				3340
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3341	if (sd_flag & SD_BALANCE_WAKE) {
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3342	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3343	want_affine = 1;
				3344	new_cpu = prev_cpu;
				3345	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3346
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3347	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3348	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f42888	2009-12-16 18:04:34 +0100	[diff] [blame]	3349	if (!(tmp->flags & SD_LOAD_BALANCE))
				3350	continue;
				3351
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3352	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3353	* If both cpu and prev_cpu are part of this domain,
				3354	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	3355	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3356	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				3357	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				3358	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3359	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3360	}
				3361
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3362	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3363	sd = tmp;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3364	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3365
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3366	if (affine_sd) {
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3367	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3368	prev_cpu = cpu;
				3369
				3370	new_cpu = select_idle_sibling(p, prev_cpu);
				3371	goto unlock;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3372	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	3373
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3374	while (sd) {
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3375	int load_idx = sd->forkexec_idx;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3376	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3377	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3378
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3379	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3380	sd = sd->child;
				3381	continue;
				3382	}
				3383
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3384	if (sd_flag & SD_BALANCE_WAKE)
				3385	load_idx = sd->wake_idx;
				3386
				3387	group = find_idlest_group(sd, p, cpu, load_idx);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3388	if (!group) {
				3389	sd = sd->child;
				3390	continue;
				3391	}
				3392
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	3393	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3394	if (new_cpu == -1 \|\| new_cpu == cpu) {
				3395	/* Now try balancing at a lower domain level of cpu */
				3396	sd = sd->child;
				3397	continue;
				3398	}
				3399
				3400	/* Now try balancing at a lower domain level of new_cpu */
				3401	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3402	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3403	sd = NULL;
				3404	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3405	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3406	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3407	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3408	sd = tmp;
				3409	}
				3410	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3411	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3412	unlock:
				3413	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3414
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3415	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3416	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3417
				3418	/*
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	3419	* Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
				3420	* removed when useful for applications beyond shares distribution (e.g.
				3421	* load-balance).
				3422	*/
				3423	#ifdef CONFIG_FAIR_GROUP_SCHED
				3424	/*
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3425	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				3426	* cfs_rq_of(p) references at time of call are still valid and identify the
				3427	* previous cpu. However, the caller only guarantees p->pi_lock is held; no
				3428	* other assumptions, including the state of rq->lock, should be made.
				3429	*/
				3430	static void
				3431	migrate_task_rq_fair(struct task_struct *p, int next_cpu)
				3432	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	3433	struct sched_entity *se = &p->se;
				3434	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3435
				3436	/*
				3437	* Load tracking: accumulate removed load so that it can be processed
				3438	* when we next update owning cfs_rq under rq->lock. Tasks contribute
				3439	* to blocked load iff they have a positive decay-count. It can never
				3440	* be negative here since on-rq tasks have decay-count == 0.
				3441	*/
				3442	if (se->avg.decay_count) {
				3443	se->avg.decay_count = -__synchronize_entity_decay(se);
				3444	atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
				3445	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3446	}
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	3447	#endif
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3448	#endif /* CONFIG_SMP */
				3449
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3450	static unsigned long
				3451	wakeup_gran(struct sched_entity curr, struct sched_entity se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3452	{
				3453	unsigned long gran = sysctl_sched_wakeup_granularity;
				3454
				3455	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3456	* Since its curr running now, convert the gran from real-time
				3457	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	3458	*
				3459	* By using 'se' instead of 'curr' we penalize light tasks, so
				3460	* they get preempted easier. That is, if 'se' < 'curr' then
				3461	* the resulting gran will be larger, therefore penalizing the
				3462	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				3463	* be smaller, again penalizing the lighter task.
				3464	*
				3465	* This is especially important for buddies when the leftmost
				3466	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3467	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	3468	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3469	}
				3470
				3471	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3472	* Should 'se' preempt 'curr'.
				3473	*
				3474	* \|s1
				3475	* \|s2
				3476	* \|s3
				3477	* g
				3478	* \|<--->\|c
				3479	*
				3480	* w(c, s1) = -1
				3481	* w(c, s2) = 0
				3482	* w(c, s3) = 1
				3483	*
				3484	*/
				3485	static int
				3486	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				3487	{
				3488	s64 gran, vdiff = curr->vruntime - se->vruntime;
				3489
				3490	if (vdiff <= 0)
				3491	return -1;
				3492
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3493	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3494	if (vdiff > gran)
				3495	return 1;
				3496
				3497	return 0;
				3498	}
				3499
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3500	static void set_last_buddy(struct sched_entity *se)
				3501	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3502	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3503	return;
				3504
				3505	for_each_sched_entity(se)
				3506	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3507	}
				3508
				3509	static void set_next_buddy(struct sched_entity *se)
				3510	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3511	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3512	return;
				3513
				3514	for_each_sched_entity(se)
				3515	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3516	}
				3517
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3518	static void set_skip_buddy(struct sched_entity *se)
				3519	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3520	for_each_sched_entity(se)
				3521	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3522	}
				3523
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3524	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3525	* Preempt the current task with a newly woken task if needed:
				3526	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	3527	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3528	{
				3529	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	3530	struct sched_entity se = &curr->se, pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3531	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3532	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3533	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3534
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	3535	if (unlikely(se == pse))
				3536	return;
				3537
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3538	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3539	* This is possible from callers such as move_task(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3540	* unconditionally check_prempt_curr() after an enqueue (which may have
				3541	* lead to a throttle). This both saves work and prevents false
				3542	* next-buddy nomination below.
				3543	*/
				3544	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				3545	return;
				3546
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3547	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	3548	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3549	next_buddy_marked = 1;
				3550	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	3551
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3552	/*
				3553	* We can come here with TIF_NEED_RESCHED already set from new task
				3554	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3555	*
				3556	* Note: this also catches the edge-case of curr being in a throttled
				3557	* group (e.g. via set_curr_task), since update_curr() (in the
				3558	* enqueue of curr) will have resulted in resched being set. This
				3559	* prevents us from potentially nominating it as a false LAST_BUDDY
				3560	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3561	*/
				3562	if (test_tsk_need_resched(curr))
				3563	return;
				3564
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3565	/* Idle tasks are by definition preempted by non-idle tasks. */
				3566	if (unlikely(curr->policy == SCHED_IDLE) &&
				3567	likely(p->policy != SCHED_IDLE))
				3568	goto preempt;
				3569
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3570	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3571	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				3572	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3573	*/
Ingo Molnar	8ed92e51	2012-10-14 14:28:50 +0200	[diff] [blame]	3574	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3575	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3576
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3577	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	3578	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3579	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3580	if (wakeup_preempt_entity(se, pse) == 1) {
				3581	/*
				3582	* Bias pick_next to pick the sched entity that is
				3583	* triggering this preemption.
				3584	*/
				3585	if (!next_buddy_marked)
				3586	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3587	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3588	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	3589
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3590	return;
				3591
				3592	preempt:
				3593	resched_task(curr);
				3594	/*
				3595	* Only set the backward buddy when the current task is still
				3596	* on the rq. This can happen when a wakeup gets interleaved
				3597	* with schedule on the ->pre_schedule() or idle_balance()
				3598	* point, either of which can * drop the rq lock.
				3599	*
				3600	* Also, during early boot the idle thread is in the fair class,
				3601	* for obvious reasons its a bad idea to schedule back to it.
				3602	*/
				3603	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				3604	return;
				3605
				3606	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				3607	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3608	}
				3609
Ingo Molnar	fb8d472	2007-08-09 11:16:48 +0200	[diff] [blame]	3610	static struct task_struct pick_next_task_fair(struct rq rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3611	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3612	struct task_struct *p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3613	struct cfs_rq *cfs_rq = &rq->cfs;
				3614	struct sched_entity *se;
				3615
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	3616	if (!cfs_rq->nr_running)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3617	return NULL;
				3618
				3619	do {
Ingo Molnar	9948f4b	2007-08-09 11:16:48 +0200	[diff] [blame]	3620	se = pick_next_entity(cfs_rq);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	3621	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3622	cfs_rq = group_cfs_rq(se);
				3623	} while (cfs_rq);
				3624
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3625	p = task_of(se);
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	3626	if (hrtick_enabled(rq))
				3627	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3628
				3629	return p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3630	}
				3631
				3632	/*
				3633	* Account for a descheduled task:
				3634	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	3635	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3636	{
				3637	struct sched_entity *se = &prev->se;
				3638	struct cfs_rq *cfs_rq;
				3639
				3640	for_each_sched_entity(se) {
				3641	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	3642	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3643	}
				3644	}
				3645
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3646	/*
				3647	* sched_yield() is very simple
				3648	*
				3649	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				3650	*/
				3651	static void yield_task_fair(struct rq *rq)
				3652	{
				3653	struct task_struct *curr = rq->curr;
				3654	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				3655	struct sched_entity *se = &curr->se;
				3656
				3657	/*
				3658	* Are we the only task in the tree?
				3659	*/
				3660	if (unlikely(rq->nr_running == 1))
				3661	return;
				3662
				3663	clear_buddies(cfs_rq, se);
				3664
				3665	if (curr->policy != SCHED_BATCH) {
				3666	update_rq_clock(rq);
				3667	/*
				3668	* Update run-time statistics of the 'current'.
				3669	*/
				3670	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	3671	/*
				3672	* Tell update_rq_clock() that we've just updated,
				3673	* so we don't do microscopic update in schedule()
				3674	* and double the fastpath cost.
				3675	*/
				3676	rq->skip_clock_update = 1;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3677	}
				3678
				3679	set_skip_buddy(se);
				3680	}
				3681
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3682	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				3683	{
				3684	struct sched_entity *se = &p->se;
				3685
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3686	/* throttled hierarchies are not runnable */
				3687	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3688	return false;
				3689
				3690	/* Tell the scheduler that we'd really like pse to run next. */
				3691	set_next_buddy(se);
				3692
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3693	yield_task_fair(rq);
				3694
				3695	return true;
				3696	}
				3697
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	3698	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3699	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	3700	* Fair scheduling class load-balancing methods.
				3701	*
				3702	* BASICS
				3703	*
				3704	* The purpose of load-balancing is to achieve the same basic fairness the
				3705	* per-cpu scheduler provides, namely provide a proportional amount of compute
				3706	* time to each task. This is expressed in the following equation:
				3707	*
				3708	* W_i,n/P_i == W_j,n/P_j for all i,j (1)
				3709	*
				3710	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				3711	* W_i,0 is defined as:
				3712	*
				3713	* W_i,0 = \Sum_j w_i,j (2)
				3714	*
				3715	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
				3716	* is derived from the nice value as per prio_to_weight[].
				3717	*
				3718	* The weight average is an exponential decay average of the instantaneous
				3719	* weight:
				3720	*
				3721	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				3722	*
				3723	* P_i is the cpu power (or compute capacity) of cpu i, typically it is the
				3724	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				3725	* can also include other factors [XXX].
				3726	*
				3727	* To achieve this balance we define a measure of imbalance which follows
				3728	* directly from (1):
				3729	*
				3730	* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
				3731	*
				3732	* We then move tasks around to minimize the imbalance. In the continuous
				3733	* function space it is obvious this converges, in the discrete case we get
				3734	* a few fun cases generally called infeasible weight scenarios.
				3735	*
				3736	* [XXX expand on:
				3737	* - infeasible weights;
				3738	* - local vs global optima in the discrete case. ]
				3739	*
				3740	*
				3741	* SCHED DOMAINS
				3742	*
				3743	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				3744	* for all i,j solution, we create a tree of cpus that follows the hardware
				3745	* topology where each level pairs two lower groups (or better). This results
				3746	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				3747	* tree to only the first of the previous level and we decrease the frequency
				3748	* of load-balance at each level inv. proportional to the number of cpus in
				3749	* the groups.
				3750	*
				3751	* This yields:
				3752	*
				3753	* \Sum_{i=0}^{log_2 n} { (1 / 2^i) * (n / 2^i) * 2^i } = O(n) (5)
				3754	*
				3755	* where, for each level i:
				3756	* - \Sum_{i=0}^{log_2 n} is the sum over all levels
				3757	* - 1 / 2^i is the freq
				3758	* - n / 2^i is the number of cpus doing load-balance
				3759	* - 2^i is the size of each group
				3760	*
				3761	* Coupled with a limit on how many tasks we can migrate every balance pass,
				3762	* this makes (5) the runtime complexity of the balancer.
				3763	*
				3764	* An important property here is that each CPU is still (indirectly) connected
				3765	* to every other cpu in at most O(log n) steps:
				3766	*
				3767	* The adjacency matrix of the resulting graph is given by:
				3768	*
				3770	* A_i,j = \Union_{k=0}^{log_2 n} (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
				3772	*
				3773	* And you'll find that:
				3774	*
				3775	* A^(log_2 n)_i,j != 0 for all i,j (7)
				3776	*
				3777	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				3778	* The task movement gives a factor of O(m), giving a convergence complexity
				3779	* of:
				3780	*
				3781	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				3782	*
				3783	*
				3784	* WORK CONSERVING
				3785	*
				3786	* In order to avoid CPUs going idle while there's still work to do, new idle
				3787	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				3788	* tree itself instead of relying on other CPUs to bring it work.
				3789	*
				3790	* This adds some complexity to both (5) and (8) but it reduces the total idle
				3791	* time.
				3792	*
				3793	* [XXX more?]
				3794	*
				3795	*
				3796	* CGROUPS
				3797	*
				3798	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				3799	*
				3801	* W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k) (9)
				3803	*
				3804	* Where
				3805	*
				3806	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				3807	*
				3808	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				3809	*
				3810	* The big problem is S_k, it's a global sum needed to compute a local (W_i)
				3811	* property.
				3812	*
				3813	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				3814	* rewrite all of this once again.]
				3815	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3816
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	3817	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				3818
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3819	#define LBF_ALL_PINNED 0x01
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3820	#define LBF_NEED_BREAK 0x02
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3821	#define LBF_SOME_PINNED 0x04
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3822
				3823	struct lb_env {
				3824	struct sched_domain *sd;
				3825
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3826	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	3827	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3828
				3829	int dst_cpu;
				3830	struct rq *dst_rq;
				3831
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3832	struct cpumask *dst_grpmask;
				3833	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3834	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	3835	long imbalance;
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	3836	/* The set of CPUs under consideration for load-balancing */
				3837	struct cpumask *cpus;
				3838
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3839	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3840
				3841	unsigned int loop;
				3842	unsigned int loop_break;
				3843	unsigned int loop_max;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3844	};
				3845
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3846	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3847	* move_task - move a task from one runqueue to another runqueue.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3848	* Both runqueues must be locked.
				3849	*/
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3850	static void move_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3851	{
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3852	deactivate_task(env->src_rq, p, 0);
				3853	set_task_cpu(p, env->dst_cpu);
				3854	activate_task(env->dst_rq, p, 0);
				3855	check_preempt_curr(env->dst_rq, p, 0);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3856	}
				3857
				3858	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3859	* Is this task likely cache-hot:
				3860	*/
				3861	static int
				3862	task_hot(struct task_struct p, u64 now, struct sched_domain sd)
				3863	{
				3864	s64 delta;
				3865
				3866	if (p->sched_class != &fair_sched_class)
				3867	return 0;
				3868
				3869	if (unlikely(p->policy == SCHED_IDLE))
				3870	return 0;
				3871
				3872	/*
				3873	* Buddy candidates are cache hot:
				3874	*/
				3875	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
				3876	(&p->se == cfs_rq_of(&p->se)->next \|\|
				3877	&p->se == cfs_rq_of(&p->se)->last))
				3878	return 1;
				3879
				3880	if (sysctl_sched_migration_cost == -1)
				3881	return 1;
				3882	if (sysctl_sched_migration_cost == 0)
				3883	return 0;
				3884
				3885	delta = now - p->se.exec_start;
				3886
				3887	return delta < (s64)sysctl_sched_migration_cost;
				3888	}
				3889
				3890	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3891	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				3892	*/
				3893	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3894	int can_migrate_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3895	{
				3896	int tsk_cache_hot = 0;
				3897	/*
				3898	* We do not migrate tasks that are:
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	3899	* 1) throttled_lb_pair, or
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3900	* 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	3901	* 3) running (obviously), or
				3902	* 4) are cache-hot on their current CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3903	*/
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	3904	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				3905	return 0;
				3906
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3907	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3908	int new_dst_cpu;
				3909
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3910	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3911
				3912	/*
				3913	* Remember if this task can be migrated to any other cpu in
				3914	* our sched_group. We may want to revisit it if we couldn't
				3915	* meet load balance goals by pulling other tasks on src_cpu.
				3916	*
				3917	* Also avoid computing new_dst_cpu if we have already computed
				3918	* one in current iteration.
				3919	*/
				3920	if (!env->dst_grpmask \|\| (env->flags & LBF_SOME_PINNED))
				3921	return 0;
				3922
				3923	new_dst_cpu = cpumask_first_and(env->dst_grpmask,
				3924	tsk_cpus_allowed(p));
				3925	if (new_dst_cpu < nr_cpu_ids) {
				3926	env->flags \|= LBF_SOME_PINNED;
				3927	env->new_dst_cpu = new_dst_cpu;
				3928	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3929	return 0;
				3930	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3931
				3932	/* Record that we found atleast one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3933	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3934
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3935	if (task_running(env->src_rq, p)) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3936	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3937	return 0;
				3938	}
				3939
				3940	/*
				3941	* Aggressive migration if:
				3942	* 1) task is cache cold, or
				3943	* 2) too many balance attempts have failed.
				3944	*/
				3945
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3946	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3947	if (!tsk_cache_hot \|\|
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3948	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	3949
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3950	if (tsk_cache_hot) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3951	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3952	schedstat_inc(p, se.statistics.nr_forced_migrations);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3953	}
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	3954
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3955	return 1;
				3956	}
				3957
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	3958	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
				3959	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3960	}
				3961
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3962	/*
				3963	* move_one_task tries to move exactly one task from busiest to this_rq, as
				3964	* part of active balancing operations within "domain".
				3965	* Returns 1 if successful and 0 otherwise.
				3966	*
				3967	* Called with both runqueues locked.
				3968	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3969	static int move_one_task(struct lb_env *env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3970	{
				3971	struct task_struct p, n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3972
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3973	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3974	if (!can_migrate_task(p, env))
				3975	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3976
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3977	move_task(p, env);
				3978	/*
				3979	* Right now, this is only the second place move_task()
				3980	* is called, so we can safely collect move_task()
				3981	* stats here rather than inside move_task().
				3982	*/
				3983	schedstat_inc(env->sd, lb_gained[env->idle]);
				3984	return 1;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3985	}
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3986	return 0;
				3987	}
				3988
/* Defined further down; the body depends on CONFIG_FAIR_GROUP_SCHED. */
static unsigned long task_h_load(struct task_struct *p);

/* Amount by which move_tasks() advances env->loop_break at each breather. */
static const unsigned int sched_nr_migrate_break = 32;
				3992
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3993	/*
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	3994	* move_tasks tries to move up to imbalance weighted load from busiest to
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3995	* this_rq, as part of a balancing operation within domain "sd".
				3996	* Returns 1 if successful and 0 otherwise.
				3997	*
				3998	* Called with both runqueues locked.
				3999	*/
				4000	static int move_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4001	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4002	struct list_head *tasks = &env->src_rq->cfs_tasks;
				4003	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4004	unsigned long load;
				4005	int pulled = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4006
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4007	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4008	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4009
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4010	while (!list_empty(tasks)) {
				4011	p = list_first_entry(tasks, struct task_struct, se.group_node);
				4012
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4013	env->loop++;
				4014	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4015	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4016	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4017
				4018	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4019	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4020	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4021	env->flags \|= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4022	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	4023	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4024
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4025	if (!can_migrate_task(p, env))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4026	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4027
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4028	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4029
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4030	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4031	goto next;
				4032
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4033	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4034	goto next;
				4035
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4036	move_task(p, env);
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4037	pulled++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4038	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4039
				4040	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4041	/*
				4042	* NEWIDLE balancing is a source of latency, so preemptible
				4043	* kernels will stop after the first task is pulled to minimize
				4044	* the critical section.
				4045	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4046	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4047	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4048	#endif
				4049
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4050	/*
				4051	* We only want to steal up to the prescribed amount of
				4052	* weighted load.
				4053	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4054	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4055	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4056
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4057	continue;
				4058	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4059	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4060	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4061
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4062	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4063	* Right now, this is one of only two places move_task() is called,
				4064	* so we can safely collect move_task() stats here rather than
				4065	* inside move_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4066	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4067	schedstat_add(env->sd, lb_gained[env->idle], pulled);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4068
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4069	return pulled;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4070	}
				4071
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4072	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4073	/*
				4074	* update tg->load_weight by folding this cpu's load_avg
				4075	*/
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4076	static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4077	{
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4078	struct sched_entity *se = tg->se[cpu];
				4079	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4080
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4081	/* throttled entities do not contribute to load */
				4082	if (throttled_hierarchy(cfs_rq))
				4083	return;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4084
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	4085	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4086
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4087	if (se) {
				4088	update_entity_load_avg(se, 1);
				4089	/*
				4090	* We pivot on our runnable average having decayed to zero for
				4091	* list removal. This generally implies that all our children
				4092	* have also been removed (modulo rounding error or bandwidth
				4093	* control); however, such cases are rare and we can fix these
				4094	* at enqueue.
				4095	*
				4096	* TODO: fix up out-of-order children on enqueue.
				4097	*/
				4098	if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
				4099	list_del_leaf_cfs_rq(cfs_rq);
				4100	} else {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4101	struct rq *rq = rq_of(cfs_rq);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4102	update_rq_runnable_avg(rq, rq->nr_running);
				4103	}
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4104	}
				4105
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4106	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4107	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4108	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4109	struct cfs_rq *cfs_rq;
				4110	unsigned long flags;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4111
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4112	raw_spin_lock_irqsave(&rq->lock, flags);
				4113	update_rq_clock(rq);
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4114	/*
				4115	* Iterates the task_group tree in a bottom up fashion, see
				4116	* list_add_leaf_cfs_rq() for details.
				4117	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4118	for_each_leaf_cfs_rq(rq, cfs_rq) {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4119	/*
				4120	* Note: We may want to consider periodically releasing
				4121	* rq->lock about these updates so that creating many task
				4122	* groups does not result in continually extending hold time.
				4123	*/
				4124	__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4125	}
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4126
				4127	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4128	}
				4129
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4130	/*
				4131	* Compute the cpu's hierarchical load factor for each task group.
				4132	* This needs to be done in a top-down fashion because the load of a child
				4133	* group is a fraction of its parents load.
				4134	*/
				4135	static int tg_load_down(struct task_group tg, void data)
				4136	{
				4137	unsigned long load;
				4138	long cpu = (long)data;
				4139
				4140	if (!tg->parent) {
				4141	load = cpu_rq(cpu)->load.weight;
				4142	} else {
				4143	load = tg->parent->cfs_rq[cpu]->h_load;
				4144	load *= tg->se[cpu]->load.weight;
				4145	load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
				4146	}
				4147
				4148	tg->cfs_rq[cpu]->h_load = load;
				4149
				4150	return 0;
				4151	}
				4152
				4153	static void update_h_load(long cpu)
				4154	{
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4155	struct rq *rq = cpu_rq(cpu);
				4156	unsigned long now = jiffies;
				4157
				4158	if (rq->h_load_throttle == now)
				4159	return;
				4160
				4161	rq->h_load_throttle = now;
				4162
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4163	rcu_read_lock();
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4164	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4165	rcu_read_unlock();
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4166	}
				4167
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4168	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4169	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4170	struct cfs_rq *cfs_rq = task_cfs_rq(p);
				4171	unsigned long load;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4172
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4173	load = p->se.load.weight;
				4174	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4175
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4176	return load;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4177	}
				4178	#else
/* No-op variant used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void update_blocked_averages(int cpu)
{
	/* nothing to do */
}
				4182
/* No-op variant used when CONFIG_FAIR_GROUP_SCHED is not set. */
static inline void update_h_load(long cpu)
{
	/* nothing to do */
}
				4186
				4187	static unsigned long task_h_load(struct task_struct *p)
				4188	{
				4189	return p->se.load.weight;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4190	}
				4191	#endif
				4192
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4193	/******** Helpers for find_busiest_group **********************/
				4194	/*
				4195	* sd_lb_stats - Structure to store the statistics of a sched_domain
				4196	* during load balancing.
				4197	*/
				4198	struct sd_lb_stats {
				4199	struct sched_group busiest; / Busiest group in this sd */
				4200	struct sched_group this; / Local group in this sd */
				4201	unsigned long total_load; /* Total load of all groups in sd */
				4202	unsigned long total_pwr; /* Total power of all groups in sd */
				4203	unsigned long avg_load; /* Average load across all groups in sd */
				4204
				4205	/** Statistics of this group */
				4206	unsigned long this_load;
				4207	unsigned long this_load_per_task;
				4208	unsigned long this_nr_running;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4209	unsigned long this_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4210	unsigned int this_idle_cpus;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4211
				4212	/* Statistics of the busiest group */
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4213	unsigned int busiest_idle_cpus;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4214	unsigned long max_load;
				4215	unsigned long busiest_load_per_task;
				4216	unsigned long busiest_nr_running;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4217	unsigned long busiest_group_capacity;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4218	unsigned long busiest_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4219	unsigned int busiest_group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4220
				4221	int group_imb; /* Is there imbalance in this sd */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4222	};
				4223
				4224	/*
				4225	* sg_lb_stats - stats of a sched_group required for load_balancing
				4226	*/
				4227	struct sg_lb_stats {
				4228	unsigned long avg_load; /Avg load across the CPUs of the group /
				4229	unsigned long group_load; /* Total load over the CPUs of the group */
				4230	unsigned long sum_nr_running; /* Nr tasks running in the group */
				4231	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
				4232	unsigned long group_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4233	unsigned long idle_cpus;
				4234	unsigned long group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4235	int group_imb; /* Is there an imbalance in the group ? */
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4236	int group_has_capacity; /* Is there extra capacity in the group? */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4237	};
				4238
				4239	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4240	* get_sd_load_idx - Obtain the load index for a given sched domain.
				4241	* @sd: The sched_domain whose load_idx is to be obtained.
				4242	* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
				4243	*/
				4244	static inline int get_sd_load_idx(struct sched_domain *sd,
				4245	enum cpu_idle_type idle)
				4246	{
				4247	int load_idx;
				4248
				4249	switch (idle) {
				4250	case CPU_NOT_IDLE:
				4251	load_idx = sd->busy_idx;
				4252	break;
				4253
				4254	case CPU_NEWLY_IDLE:
				4255	load_idx = sd->newidle_idx;
				4256	break;
				4257	default:
				4258	load_idx = sd->idle_idx;
				4259	break;
				4260	}
				4261
				4262	return load_idx;
				4263	}
				4264
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4265	static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4266	{
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4267	return SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4268	}
				4269
				4270	unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
				4271	{
				4272	return default_scale_freq_power(sd, cpu);
				4273	}
				4274
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4275	static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4276	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4277	unsigned long weight = sd->span_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4278	unsigned long smt_gain = sd->smt_gain;
				4279
				4280	smt_gain /= weight;
				4281
				4282	return smt_gain;
				4283	}
				4284
				4285	unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
				4286	{
				4287	return default_scale_smt_power(sd, cpu);
				4288	}
				4289
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4290	static unsigned long scale_rt_power(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4291	{
				4292	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4293	u64 total, available, age_stamp, avg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4294
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4295	/*
				4296	* Since we're reading these variables without serialization make sure
				4297	* we read them once before doing sanity checks on them.
				4298	*/
				4299	age_stamp = ACCESS_ONCE(rq->age_stamp);
				4300	avg = ACCESS_ONCE(rq->rt_avg);
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4301
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4302	total = sched_avg_period() + (rq->clock - age_stamp);
				4303
				4304	if (unlikely(total < avg)) {
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4305	/* Ensures that power won't end up being negative */
				4306	available = 0;
				4307	} else {
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4308	available = total - avg;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4309	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4310
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4311	if (unlikely((s64)total < SCHED_POWER_SCALE))
				4312	total = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4313
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4314	total >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4315
				4316	return div_u64(available, total);
				4317	}
				4318
				4319	static void update_cpu_power(struct sched_domain *sd, int cpu)
				4320	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4321	unsigned long weight = sd->span_weight;
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4322	unsigned long power = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4323	struct sched_group *sdg = sd->groups;
				4324
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4325	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
				4326	if (sched_feat(ARCH_POWER))
				4327	power *= arch_scale_smt_power(sd, cpu);
				4328	else
				4329	power *= default_scale_smt_power(sd, cpu);
				4330
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4331	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4332	}
				4333
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4334	sdg->sgp->power_orig = power;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4335
				4336	if (sched_feat(ARCH_POWER))
				4337	power *= arch_scale_freq_power(sd, cpu);
				4338	else
				4339	power *= default_scale_freq_power(sd, cpu);
				4340
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4341	power >>= SCHED_POWER_SHIFT;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4342
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4343	power *= scale_rt_power(cpu);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4344	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4345
				4346	if (!power)
				4347	power = 1;
				4348
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4349	cpu_rq(cpu)->cpu_power = power;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4350	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4351	}
				4352
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4353	void update_group_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4354	{
				4355	struct sched_domain *child = sd->child;
				4356	struct sched_group group, sdg = sd->groups;
				4357	unsigned long power;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4358	unsigned long interval;
				4359
				4360	interval = msecs_to_jiffies(sd->balance_interval);
				4361	interval = clamp(interval, 1UL, max_load_balance_interval);
				4362	sdg->sgp->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4363
				4364	if (!child) {
				4365	update_cpu_power(sd, cpu);
				4366	return;
				4367	}
				4368
				4369	power = 0;
				4370
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4371	if (child->flags & SD_OVERLAP) {
				4372	/*
				4373	* SD_OVERLAP domains cannot assume that child groups
				4374	* span the current group.
				4375	*/
				4376
				4377	for_each_cpu(cpu, sched_group_cpus(sdg))
				4378	power += power_of(cpu);
				4379	} else {
				4380	/*
				4381	* !SD_OVERLAP domains can assume that child groups
				4382	* span the current group.
				4383	*/
				4384
				4385	group = child->groups;
				4386	do {
				4387	power += group->sgp->power;
				4388	group = group->next;
				4389	} while (group != child->groups);
				4390	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4391
Peter Zijlstra	c3decf0	2012-05-31 12:05:32 +0200	[diff] [blame]	4392	sdg->sgp->power_orig = sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4393	}
				4394
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4395	/*
				4396	* Try and fix up capacity for tiny siblings, this is needed when
				4397	* things like SD_ASYM_PACKING need f_b_g to select another sibling
				4398	* which on its own isn't powerful enough.
				4399	*
				4400	* See update_sd_pick_busiest() and check_asym_packing().
				4401	*/
				4402	static inline int
				4403	fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
				4404	{
				4405	/*
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4406	* Only siblings can have significantly less than SCHED_POWER_SCALE
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4407	*/
Peter Zijlstra	a6c75f2	2011-04-07 14:09:52 +0200	[diff] [blame]	4408	if (!(sd->flags & SD_SHARE_CPUPOWER))
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4409	return 0;
				4410
				4411	/*
				4412	* If ~90% of the cpu_power is still there, we're good.
				4413	*/
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4414	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4415	return 1;
				4416
				4417	return 0;
				4418	}
				4419
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4420	/**
				4421	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4422	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4423	* @group: sched_group whose statistics are to be updated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4424	* @load_idx: Load index of sched_domain of this_cpu for load calc.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4425	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4426	* @balance: Should we balance.
				4427	* @sgs: variable to hold the statistics for this group.
				4428	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4429	static inline void update_sg_lb_stats(struct lb_env *env,
				4430	struct sched_group *group, int load_idx,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4431	int local_group, int *balance, struct sg_lb_stats *sgs)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4432	{
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4433	unsigned long nr_running, max_nr_running, min_nr_running;
				4434	unsigned long load, max_cpu_load, min_cpu_load;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4435	unsigned int balance_cpu = -1, first_idle_cpu = 0;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4436	unsigned long avg_load_per_task = 0;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4437	int i;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4438
Gautham R Shenoy	871e35b	2010-01-20 14:02:44 -0600	[diff] [blame]	4439	if (local_group)
Peter Zijlstra	c117487	2012-05-31 14:47:33 +0200	[diff] [blame]	4440	balance_cpu = group_balance_cpu(group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4441
				4442	/* Tally up the load of all CPUs in the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4443	max_cpu_load = 0;
				4444	min_cpu_load = ~0UL;
Nikhil Rao	2582f0e	2010-10-13 12:09:36 -0700	[diff] [blame]	4445	max_nr_running = 0;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4446	min_nr_running = ~0UL;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4447
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4448	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4449	struct rq *rq = cpu_rq(i);
				4450
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4451	nr_running = rq->nr_running;
				4452
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4453	/* Bias balancing toward cpus of our domain */
				4454	if (local_group) {
Peter Zijlstra	c117487	2012-05-31 14:47:33 +0200	[diff] [blame]	4455	if (idle_cpu(i) && !first_idle_cpu &&
				4456	cpumask_test_cpu(i, sched_group_mask(group))) {
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4457	first_idle_cpu = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4458	balance_cpu = i;
				4459	}
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4460
				4461	load = target_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4462	} else {
				4463	load = source_load(i, load_idx);
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4464	if (load > max_cpu_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4465	max_cpu_load = load;
				4466	if (min_cpu_load > load)
				4467	min_cpu_load = load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4468
				4469	if (nr_running > max_nr_running)
				4470	max_nr_running = nr_running;
				4471	if (min_nr_running > nr_running)
				4472	min_nr_running = nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4473	}
				4474
				4475	sgs->group_load += load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4476	sgs->sum_nr_running += nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4477	sgs->sum_weighted_load += weighted_cpuload(i);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4478	if (idle_cpu(i))
				4479	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4480	}
				4481
				4482	/*
				4483	* First idle cpu or the first cpu(busiest) in this sched group
				4484	* is eligible for doing load balancing at this and above
				4485	* domains. In the newly idle case, we will allow all the cpu's
				4486	* to do the newly idle load balance.
				4487	*/
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4488	if (local_group) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4489	if (env->idle != CPU_NEWLY_IDLE) {
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4490	if (balance_cpu != env->dst_cpu) {
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4491	*balance = 0;
				4492	return;
				4493	}
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4494	update_group_power(env->sd, env->dst_cpu);
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4495	} else if (time_after_eq(jiffies, group->sgp->next_update))
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4496	update_group_power(env->sd, env->dst_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4497	}
				4498
				4499	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4500	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4501
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4502	/*
				4503	* Consider the group unbalanced when the imbalance is larger
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	4504	* than the average weight of a task.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4505	*
				4506	* APZ: with cgroup the avg task weight can vary wildly and
				4507	* might not be a suitable number - should we keep a
				4508	* normalized nr_running number somewhere that negates
				4509	* the hierarchy?
				4510	*/
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4511	if (sgs->sum_nr_running)
				4512	avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4513
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4514	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
				4515	(max_nr_running - min_nr_running) > 1)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4516	sgs->group_imb = 1;
				4517
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4518	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4519	SCHED_POWER_SCALE);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4520	if (!sgs->group_capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4521	sgs->group_capacity = fix_small_capacity(env->sd, group);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4522	sgs->group_weight = group->group_weight;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4523
				4524	if (sgs->group_capacity > sgs->sum_nr_running)
				4525	sgs->group_has_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4526	}
				4527
				4528	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4529	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4530	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4531	* @sds: sched_domain statistics
				4532	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4533	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4534	*
				4535	* Determine if @sg is a busier group than the previously selected
				4536	* busiest group.
				4537	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4538	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4539	struct sd_lb_stats *sds,
				4540	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4541	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4542	{
				4543	if (sgs->avg_load <= sds->max_load)
				4544	return false;
				4545
				4546	if (sgs->sum_nr_running > sgs->group_capacity)
				4547	return true;
				4548
				4549	if (sgs->group_imb)
				4550	return true;
				4551
				4552	/*
				4553	* ASYM_PACKING needs to move all the work to the lowest
				4554	* numbered CPUs in the group, therefore mark all groups
				4555	* higher than ourself as busy.
				4556	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4557	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
				4558	env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4559	if (!sds->busiest)
				4560	return true;
				4561
				4562	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
				4563	return true;
				4564	}
				4565
				4566	return false;
				4567	}
				4568
				4569	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	4570	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4571	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4572	* @balance: Should we balance.
				4573	* @sds: variable to hold the statistics for this sched_domain.
				4574	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4575	static inline void update_sd_lb_stats(struct lb_env *env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4576	int *balance, struct sd_lb_stats *sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4577	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4578	struct sched_domain *child = env->sd->child;
				4579	struct sched_group *sg = env->sd->groups;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4580	struct sg_lb_stats sgs;
				4581	int load_idx, prefer_sibling = 0;
				4582
				4583	if (child && child->flags & SD_PREFER_SIBLING)
				4584	prefer_sibling = 1;
				4585
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4586	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4587
				4588	do {
				4589	int local_group;
				4590
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4591	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4592	memset(&sgs, 0, sizeof(sgs));
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4593	update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4594
Peter Zijlstra	8f190fb	2009-12-24 14:18:21 +0100	[diff] [blame]	4595	if (local_group && !(*balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4596	return;
				4597
				4598	sds->total_load += sgs.group_load;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4599	sds->total_pwr += sg->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4600
				4601	/*
				4602	* In case the child domain prefers tasks go to siblings
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4603	* first, lower the sg capacity to one so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	4604	* and move all the excess tasks away. We lower the capacity
				4605	* of a group only if the local group has the capacity to fit
				4606	* these excess tasks, i.e. nr_running < group_capacity. The
				4607	* extra check prevents the case where you always pull from the
				4608	* heaviest group when it is already under-utilized (possible
				4609	* with a large weight task outweighs the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4610	*/
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	4611	if (prefer_sibling && !local_group && sds->this_has_capacity)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4612	sgs.group_capacity = min(sgs.group_capacity, 1UL);
				4613
				4614	if (local_group) {
				4615	sds->this_load = sgs.avg_load;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4616	sds->this = sg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4617	sds->this_nr_running = sgs.sum_nr_running;
				4618	sds->this_load_per_task = sgs.sum_weighted_load;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4619	sds->this_has_capacity = sgs.group_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4620	sds->this_idle_cpus = sgs.idle_cpus;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4621	} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4622	sds->max_load = sgs.avg_load;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4623	sds->busiest = sg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4624	sds->busiest_nr_running = sgs.sum_nr_running;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4625	sds->busiest_idle_cpus = sgs.idle_cpus;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4626	sds->busiest_group_capacity = sgs.group_capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4627	sds->busiest_load_per_task = sgs.sum_weighted_load;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4628	sds->busiest_has_capacity = sgs.group_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4629	sds->busiest_group_weight = sgs.group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4630	sds->group_imb = sgs.group_imb;
				4631	}
				4632
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4633	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4634	} while (sg != env->sd->groups);
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4635	}
				4636
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4637	/**
				4638	* check_asym_packing - Check to see if the group is packed into the
				4639	* sched domain.
				4640	*
				4641	* This is primarily intended to be used at the sibling level. Some
				4642	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				4643	* case of POWER7, it can move to lower SMT modes only when higher
				4644	* threads are idle. When in lower SMT modes, the threads will
				4645	* perform better since they share less core resources. Hence when we
				4646	* have idle threads, we want them to be the higher ones.
				4647	*
				4648	* This packing function is run on idle threads. It checks to see if
				4649	* the busiest CPU in this domain (core in the P7 case) has a higher
				4650	* CPU number than the packing function is being run on. Here we are
				4651	* assuming lower CPU number will be equivalent to lower a SMT thread
				4652	* number.
				4653	*
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4654	* Returns 1 when packing is required and a task should be moved to
				4655	* this CPU. The amount of the imbalance is returned in *imbalance.
				4656	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4657	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4658	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4659	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4660	static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4661	{
				4662	int busiest_cpu;
				4663
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4664	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4665	return 0;
				4666
				4667	if (!sds->busiest)
				4668	return 0;
				4669
				4670	busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4671	if (env->dst_cpu > busiest_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4672	return 0;
				4673
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4674	env->imbalance = DIV_ROUND_CLOSEST(
				4675	sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
				4676
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4677	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4678	}
				4679
				4680	/**
				4681	* fix_small_imbalance - Calculate the minor imbalance that exists
				4682	* amongst the groups of a sched_domain, during
				4683	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4684	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4685	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4686	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4687	static inline
				4688	void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4689	{
				4690	unsigned long tmp, pwr_now = 0, pwr_move = 0;
				4691	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4692	unsigned long scaled_busy_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4693
				4694	if (sds->this_nr_running) {
				4695	sds->this_load_per_task /= sds->this_nr_running;
				4696	if (sds->busiest_load_per_task >
				4697	sds->this_load_per_task)
				4698	imbn = 1;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4699	} else {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4700	sds->this_load_per_task =
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4701	cpu_avg_load_per_task(env->dst_cpu);
				4702	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4703
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4704	scaled_busy_load_per_task = sds->busiest_load_per_task
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4705	* SCHED_POWER_SCALE;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4706	scaled_busy_load_per_task /= sds->busiest->sgp->power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4707
				4708	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
				4709	(scaled_busy_load_per_task * imbn)) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4710	env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4711	return;
				4712	}
				4713
				4714	/*
				4715	* OK, we don't have enough imbalance to justify moving tasks,
				4716	* however we may be able to increase total CPU power used by
				4717	* moving them.
				4718	*/
				4719
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4720	pwr_now += sds->busiest->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4721	min(sds->busiest_load_per_task, sds->max_load);
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4722	pwr_now += sds->this->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4723	min(sds->this_load_per_task, sds->this_load);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4724	pwr_now /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4725
				4726	/* Amount of load we'd subtract */
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4727	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4728	sds->busiest->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4729	if (sds->max_load > tmp)
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4730	pwr_move += sds->busiest->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4731	min(sds->busiest_load_per_task, sds->max_load - tmp);
				4732
				4733	/* Amount of load we'd add */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4734	if (sds->max_load * sds->busiest->sgp->power <
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4735	sds->busiest_load_per_task * SCHED_POWER_SCALE)
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4736	tmp = (sds->max_load * sds->busiest->sgp->power) /
				4737	sds->this->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4738	else
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4739	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4740	sds->this->sgp->power;
				4741	pwr_move += sds->this->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4742	min(sds->this_load_per_task, sds->this_load + tmp);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4743	pwr_move /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4744
				4745	/* Move if we gain throughput */
				4746	if (pwr_move > pwr_now)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4747	env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4748	}
				4749
				4750	/**
				4751	* calculate_imbalance - Calculate the amount of imbalance present within the
				4752	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4753	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4754	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4755	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4756	static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4757	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4758	unsigned long max_pull, load_above_capacity = ~0UL;
				4759
				4760	sds->busiest_load_per_task /= sds->busiest_nr_running;
				4761	if (sds->group_imb) {
				4762	sds->busiest_load_per_task =
				4763	min(sds->busiest_load_per_task, sds->avg_load);
				4764	}
				4765
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4766	/*
				4767	* In the presence of smp nice balancing, certain scenarios can have
				4768	* max load less than avg load(as we skip the groups at or below
				4769	* its cpu_power, while calculating max_load..)
				4770	*/
				4771	if (sds->max_load < sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4772	env->imbalance = 0;
				4773	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4774	}
				4775
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4776	if (!sds->group_imb) {
				4777	/*
				4778	* Don't want to pull so many tasks that a group would go idle.
				4779	*/
				4780	load_above_capacity = (sds->busiest_nr_running -
				4781	sds->busiest_group_capacity);
				4782
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4783	load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4784
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4785	load_above_capacity /= sds->busiest->sgp->power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4786	}
				4787
				4788	/*
				4789	* We're trying to get all the cpus to the average_load, so we don't
				4790	* want to push ourselves above the average load, nor do we wish to
				4791	* reduce the max loaded cpu below the average load. At the same time,
				4792	* we also don't want to reduce the group load below the group capacity
				4793	* (so that we can implement power-savings policies etc). Thus we look
				4794	* for the minimum possible imbalance.
				4795	* Be careful of negative numbers as they'll appear as very large values
				4796	* with unsigned longs.
				4797	*/
				4798	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4799
				4800	/* How much load to actually move to equalise the imbalance */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4801	env->imbalance = min(max_pull * sds->busiest->sgp->power,
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4802	(sds->avg_load - sds->this_load) * sds->this->sgp->power)
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4803	/ SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4804
				4805	/*
				4806	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	4807	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4808	* a think about bumping its value to force at least one task to be
				4809	* moved
				4810	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4811	if (env->imbalance < sds->busiest_load_per_task)
				4812	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4813
				4814	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4815
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4816	/***** find_busiest_group() helpers end here *******************/
				4817
				4818	/**
				4819	* find_busiest_group - Returns the busiest group within the sched_domain
				4820	* if there is an imbalance. If there isn't an imbalance, and
				4821	* the user has opted for power-savings, it returns a group whose
				4822	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
				4823	* such a group exists.
				4824	*
				4825	* Also calculates the amount of weighted load which should be moved
				4826	* to restore balance.
				4827	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4828	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4829	* @balance: Pointer to a variable indicating if this_cpu
				4830	* is the appropriate cpu to perform load balancing at this_level.
				4831	*
				4832	* Returns: - the busiest group if imbalance exists.
				4833	* - If no imbalance and user has opted for power-savings balance,
				4834	* return the least loaded group whose CPUs can be
				4835	* put to idle by rebalancing its tasks onto our group.
				4836	*/
				4837	static struct sched_group *
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4838	find_busiest_group(struct lb_env *env, int *balance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4839	{
				4840	struct sd_lb_stats sds;
				4841
				4842	memset(&sds, 0, sizeof(sds));
				4843
				4844	/*
				4845	* Compute the various statistics relevant for load balancing at
				4846	* this level.
				4847	*/
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4848	update_sd_lb_stats(env, balance, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4849
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4850	/*
				4851	* this_cpu is not the appropriate cpu to perform load balancing at
				4852	* this level.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4853	*/
Peter Zijlstra	8f190fb	2009-12-24 14:18:21 +0100	[diff] [blame]	4854	if (!(*balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4855	goto ret;
				4856
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4857	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
				4858	check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4859	return sds.busiest;
				4860
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4861	/* There is no busy sibling group to pull tasks from */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4862	if (!sds.busiest || sds.busiest_nr_running == 0)
				4863	goto out_balanced;
				4864
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4865	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	4866
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	4867	/*
				4868	* If the busiest group is imbalanced the below checks don't
				4869	* work because they assume all things are equal, which typically
				4870	* isn't true due to cpus_allowed constraints and the like.
				4871	*/
				4872	if (sds.group_imb)
				4873	goto force_balance;
				4874
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4875	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4876	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4877	!sds.busiest_has_capacity)
				4878	goto force_balance;
				4879
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4880	/*
				4881	* If the local group is more busy than the selected busiest group
				4882	* don't try and pull any tasks.
				4883	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4884	if (sds.this_load >= sds.max_load)
				4885	goto out_balanced;
				4886
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4887	/*
				4888	* Don't pull any tasks if this group is already above the domain
				4889	* average load.
				4890	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4891	if (sds.this_load >= sds.avg_load)
				4892	goto out_balanced;
				4893
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4894	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4895	/*
				4896	* This cpu is idle. If the busiest group load doesn't
				4897	* have more tasks than the number of available cpu's and
				4898	* there is no imbalance between this and busiest group
				4899	* wrt to idle cpu's, it is balanced.
				4900	*/
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4901	if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4902	sds.busiest_nr_running <= sds.busiest_group_weight)
				4903	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4904	} else {
				4905	/*
				4906	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				4907	* imbalance_pct to be conservative.
				4908	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4909	if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4910	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4911	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4912
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4913	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4914	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4915	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4916	return sds.busiest;
				4917
				4918	out_balanced:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4919	ret:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4920	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4921	return NULL;
				4922	}
				4923
				4924	/*
				4925	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				4926	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4927	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4928	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4929	{
				4930	struct rq busiest = NULL, rq;
				4931	unsigned long max_load = 0;
				4932	int i;
				4933
				4934	for_each_cpu(i, sched_group_cpus(group)) {
				4935	unsigned long power = power_of(i);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4936	unsigned long capacity = DIV_ROUND_CLOSEST(power,
				4937	SCHED_POWER_SCALE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4938	unsigned long wl;
				4939
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4940	if (!capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4941	capacity = fix_small_capacity(env->sd, group);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4942
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4943	if (!cpumask_test_cpu(i, env->cpus))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4944	continue;
				4945
				4946	rq = cpu_rq(i);
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4947	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4948
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4949	/*
				4950	* When comparing with imbalance, use weighted_cpuload()
				4951	* which is not scaled with the cpu power.
				4952	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4953	if (capacity && rq->nr_running == 1 && wl > env->imbalance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4954	continue;
				4955
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4956	/*
				4957	* For the load comparisons with the other cpu's, consider
				4958	* the weighted_cpuload() scaled with the cpu power, so that
				4959	* the load can be moved away from the cpu that is potentially
				4960	* running at a lower capacity.
				4961	*/
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4962	wl = (wl * SCHED_POWER_SCALE) / power;
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4963
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4964	if (wl > max_load) {
				4965	max_load = wl;
				4966	busiest = rq;
				4967	}
				4968	}
				4969
				4970	return busiest;
				4971	}
				4972
				4973	/*
				4974	* Bound (same unit as sd->balance_interval) below which load_balance() keeps doubling the interval while LBF_ALL_PINNED is set.
				4975	* The value is pretty arbitrary; it only has to be large enough.
				4976	*/
				4977	#define MAX_PINNED_INTERVAL 512
				4978
				4979	/* Per-cpu working cpumask for load_balance(): seeded from cpu_active_mask, with cpus whose tasks are all pinned cleared as it retries. */
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame^]	4980	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4981
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4982	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	4983	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4984	struct sched_domain *sd = env->sd;
				4985
				4986	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4987
				4988	/*
				4989	* ASYM_PACKING needs to force migrate tasks from busy but
				4990	* higher numbered CPUs in order to pack all tasks in the
				4991	* lowest numbered CPUs.
				4992	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4993	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4994	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	4995	}
				4996
				4997	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				4998	}
				4999
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5000	static int active_load_balance_cpu_stop(void *data);
				5001
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5002	/*
				5003	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				5004	* tasks if there is an imbalance.
				5005	*/
				5006	static int load_balance(int this_cpu, struct rq *this_rq,
				5007	struct sched_domain *sd, enum cpu_idle_type idle,
				5008	int *balance)
				5009	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5010	int ld_moved, cur_ld_moved, active_balance = 0;
				5011	int lb_iterations, max_lb_iterations;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5012	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5013	struct rq *busiest;
				5014	unsigned long flags;
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame^]	5015	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5016
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5017	struct lb_env env = {
				5018	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5019	.dst_cpu = this_cpu,
				5020	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5021	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5022	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	5023	.loop_break = sched_nr_migrate_break,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5024	.cpus = cpus,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5025	};
				5026
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5027	/*
				5028	* For NEWLY_IDLE load_balancing, we don't need to consider
				5029	* other cpus in our group
				5030	*/
				5031	if (idle == CPU_NEWLY_IDLE) {
				5032	env.dst_grpmask = NULL;
				5033	/*
				5034	* we don't care max_lb_iterations in this case,
				5035	* in following patch, this will be removed
				5036	*/
				5037	max_lb_iterations = 0;
				5038	} else
				5039	max_lb_iterations = cpumask_weight(env.dst_grpmask);
				5040
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5041	cpumask_copy(cpus, cpu_active_mask);
				5042
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5043	schedstat_inc(sd, lb_count[idle]);
				5044
				5045	redo:
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5046	group = find_busiest_group(&env, balance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5047
				5048	if (*balance == 0)
				5049	goto out_balanced;
				5050
				5051	if (!group) {
				5052	schedstat_inc(sd, lb_nobusyg[idle]);
				5053	goto out_balanced;
				5054	}
				5055
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5056	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5057	if (!busiest) {
				5058	schedstat_inc(sd, lb_nobusyq[idle]);
				5059	goto out_balanced;
				5060	}
				5061
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5062	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5063
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5064	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5065
				5066	ld_moved = 0;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5067	lb_iterations = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5068	if (busiest->nr_running > 1) {
				5069	/*
				5070	* Attempt to move tasks. If find_busiest_group has found
				5071	* an imbalance but busiest->nr_running <= 1, the group is
				5072	* still unbalanced. ld_moved simply stays zero, so it is
				5073	* correctly treated as an imbalance.
				5074	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5075	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	5076	env.src_cpu = busiest->cpu;
				5077	env.src_rq = busiest;
				5078	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5079
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	5080	update_h_load(env.src_cpu);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5081	more_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5082	local_irq_save(flags);
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5083	double_rq_lock(env.dst_rq, busiest);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5084
				5085	/*
				5086	* cur_ld_moved - load moved in current iteration
				5087	* ld_moved - cumulative load moved across iterations
				5088	*/
				5089	cur_ld_moved = move_tasks(&env);
				5090	ld_moved += cur_ld_moved;
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5091	double_rq_unlock(env.dst_rq, busiest);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5092	local_irq_restore(flags);
				5093
				5094	/*
				5095	* some other cpu did the load balance for us.
				5096	*/
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5097	if (cur_ld_moved && env.dst_cpu != smp_processor_id())
				5098	resched_cpu(env.dst_cpu);
				5099
Joonsoo Kim	f1cd085	2013-04-23 17:27:37 +0900	[diff] [blame]	5100	if (env.flags & LBF_NEED_BREAK) {
				5101	env.flags &= ~LBF_NEED_BREAK;
				5102	goto more_balance;
				5103	}
				5104
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5105	/*
				5106	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				5107	* us and move them to an alternate dst_cpu in our sched_group
				5108	* where they can run. The upper limit on how many times we
				5109	* iterate on same src_cpu is dependent on number of cpus in our
				5110	* sched_group.
				5111	*
				5112	* This changes load balance semantics a bit on who can move
				5113	* load to a given_cpu. In addition to the given_cpu itself
				5114	* (or a ilb_cpu acting on its behalf where given_cpu is
				5115	* nohz-idle), we now have balance_cpu in a position to move
				5116	* load to given_cpu. In rare situations, this may cause
				5117	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				5118	* _independently_ and at _same_ time to move some load to
				5119	* given_cpu) causing exceess load to be moved to given_cpu.
				5120	* This however should not happen so much in practice and
				5121	* moreover subsequent load balance cycles should correct the
				5122	* excess load moved.
				5123	*/
				5124	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
				5125	lb_iterations++ < max_lb_iterations) {
				5126
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5127	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5128	env.dst_cpu = env.new_dst_cpu;
				5129	env.flags &= ~LBF_SOME_PINNED;
				5130	env.loop = 0;
				5131	env.loop_break = sched_nr_migrate_break;
				5132	/*
				5133	* Go back to "more_balance" rather than "redo" since we
				5134	* need to continue with same src_cpu.
				5135	*/
				5136	goto more_balance;
				5137	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5138
				5139	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5140	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5141	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5142	if (!cpumask_empty(cpus)) {
				5143	env.loop = 0;
				5144	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5145	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5146	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5147	goto out_balanced;
				5148	}
				5149	}
				5150
				5151	if (!ld_moved) {
				5152	schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	5153	/*
				5154	* Increment the failure counter only on periodic balance.
				5155	* We do not want newidle balance, which can be very
				5156	* frequent, pollute the failure counter causing
				5157	* excessive cache_hot migrations and active balances.
				5158	*/
				5159	if (idle != CPU_NEWLY_IDLE)
				5160	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5161
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5162	if (need_active_balance(&env)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5163	raw_spin_lock_irqsave(&busiest->lock, flags);
				5164
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5165	/* don't kick the active_load_balance_cpu_stop,
				5166	* if the curr task on busiest cpu can't be
				5167	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5168	*/
				5169	if (!cpumask_test_cpu(this_cpu,
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	5170	tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5171	raw_spin_unlock_irqrestore(&busiest->lock,
				5172	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5173	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5174	goto out_one_pinned;
				5175	}
				5176
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5177	/*
				5178	* ->active_balance synchronizes accesses to
				5179	* ->active_balance_work. Once set, it's cleared
				5180	* only after active load balance is finished.
				5181	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5182	if (!busiest->active_balance) {
				5183	busiest->active_balance = 1;
				5184	busiest->push_cpu = this_cpu;
				5185	active_balance = 1;
				5186	}
				5187	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5188
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5189	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5190	stop_one_cpu_nowait(cpu_of(busiest),
				5191	active_load_balance_cpu_stop, busiest,
				5192	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5193	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5194
				5195	/*
				5196	* We've kicked active balancing, reset the failure
				5197	* counter.
				5198	*/
				5199	sd->nr_balance_failed = sd->cache_nice_tries+1;
				5200	}
				5201	} else
				5202	sd->nr_balance_failed = 0;
				5203
				5204	if (likely(!active_balance)) {
				5205	/* We were unbalanced, so reset the balancing interval */
				5206	sd->balance_interval = sd->min_interval;
				5207	} else {
				5208	/*
				5209	* If we've begun active balancing, start to back off. This
				5210	* case may not be covered by the all_pinned logic if there
				5211	* is only 1 task on the busy runqueue (because we don't call
				5212	* move_tasks).
				5213	*/
				5214	if (sd->balance_interval < sd->max_interval)
				5215	sd->balance_interval *= 2;
				5216	}
				5217
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5218	goto out;
				5219
				5220	out_balanced:
				5221	schedstat_inc(sd, lb_balanced[idle]);
				5222
				5223	sd->nr_balance_failed = 0;
				5224
				5225	out_one_pinned:
				5226	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5227	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	5228	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5229	(sd->balance_interval < sd->max_interval))
				5230	sd->balance_interval *= 2;
				5231
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	5232	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5233	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5234	return ld_moved;
				5235	}
				5236
				5237	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5238	* idle_balance is called by schedule() if this_cpu is about to become
				5239	* idle. Attempts to pull tasks from other CPUs.
				5240	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5241	void idle_balance(int this_cpu, struct rq *this_rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5242	{
				5243	struct sched_domain *sd;
				5244	int pulled_task = 0;
				5245	unsigned long next_balance = jiffies + HZ;
				5246
				5247	this_rq->idle_stamp = this_rq->clock;
				5248
				5249	if (this_rq->avg_idle < sysctl_sched_migration_cost)
				5250	return;
				5251
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5252	/*
				5253	* Drop the rq->lock, but keep IRQ/preempt disabled.
				5254	*/
				5255	raw_spin_unlock(&this_rq->lock);
				5256
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5257	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5258	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5259	for_each_domain(this_cpu, sd) {
				5260	unsigned long interval;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5261	int balance = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5262
				5263	if (!(sd->flags & SD_LOAD_BALANCE))
				5264	continue;
				5265
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5266	if (sd->flags & SD_BALANCE_NEWIDLE) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5267	/* If we've pulled tasks over stop searching: */
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5268	pulled_task = load_balance(this_cpu, this_rq,
				5269	sd, CPU_NEWLY_IDLE, &balance);
				5270	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5271
				5272	interval = msecs_to_jiffies(sd->balance_interval);
				5273	if (time_after(next_balance, sd->last_balance + interval))
				5274	next_balance = sd->last_balance + interval;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5275	if (pulled_task) {
				5276	this_rq->idle_stamp = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5277	break;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5278	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5279	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5280	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5281
				5282	raw_spin_lock(&this_rq->lock);
				5283
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5284	if (pulled_task \|\| time_after(jiffies, this_rq->next_balance)) {
				5285	/*
				5286	* We are going idle. next_balance may be set based on
				5287	* a busy processor. So reset next_balance.
				5288	*/
				5289	this_rq->next_balance = next_balance;
				5290	}
				5291	}
				5292
				5293	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5294	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				5295	* running tasks off the busiest CPU onto idle CPUs. It requires at
				5296	* least 1 task to be running on each physical CPU where possible, and
				5297	* avoids physical / logical imbalances.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5298	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5299	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5300	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5301	struct rq *busiest_rq = data;
				5302	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5303	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5304	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5305	struct sched_domain *sd;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5306
				5307	raw_spin_lock_irq(&busiest_rq->lock);
				5308
				5309	/* make sure the requested cpu hasn't gone down in the meantime */
				5310	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				5311	!busiest_rq->active_balance))
				5312	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5313
				5314	/* Is there any task to move? */
				5315	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5316	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5317
				5318	/*
				5319	* This condition is "impossible", if it occurs
				5320	* we need to fix it. Originally reported by
				5321	* Bjorn Helgaas on a 128-cpu setup.
				5322	*/
				5323	BUG_ON(busiest_rq == target_rq);
				5324
				5325	/* move a task from busiest_rq to target_rq */
				5326	double_lock_balance(busiest_rq, target_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5327
				5328	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5329	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5330	for_each_domain(target_cpu, sd) {
				5331	if ((sd->flags & SD_LOAD_BALANCE) &&
				5332	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				5333	break;
				5334	}
				5335
				5336	if (likely(sd)) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5337	struct lb_env env = {
				5338	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5339	.dst_cpu = target_cpu,
				5340	.dst_rq = target_rq,
				5341	.src_cpu = busiest_rq->cpu,
				5342	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5343	.idle = CPU_IDLE,
				5344	};
				5345
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5346	schedstat_inc(sd, alb_count);
				5347
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5348	if (move_one_task(&env))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5349	schedstat_inc(sd, alb_pushed);
				5350	else
				5351	schedstat_inc(sd, alb_failed);
				5352	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5353	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5354	double_unlock_balance(busiest_rq, target_rq);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5355	out_unlock:
				5356	busiest_rq->active_balance = 0;
				5357	raw_spin_unlock_irq(&busiest_rq->lock);
				5358	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5359	}
				5360
				5361	#ifdef CONFIG_NO_HZ
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5362	/*
				5363	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5364	* - When one of the busy CPUs notices that an idle rebalancing may be
				5365	* needed, it kicks the idle load balancer, which then does idle
				5366	* load balancing for all the idle CPUs.
				5367	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5368	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5369	cpumask_var_t idle_cpus_mask; /* cpus that went idle with the tick stopped */
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5370	atomic_t nr_cpus; /* incremented/decremented together with idle_cpus_mask updates */
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5371	unsigned long next_balance; /* in jiffy units */
				5372	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5373
Peter Zijlstra	8e7fbcb	2012-01-09 11:28:35 +0100	[diff] [blame]	5374	static inline int find_new_ilb(int call_cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5375	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5376	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5377
Suresh Siddha	786d6dc7	2011-12-01 17:07:35 -0800	[diff] [blame]	5378	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				5379	return ilb;
				5380
				5381	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5382	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5383
				5384	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5385	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				5386	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				5387	* CPU (if there is one).
				5388	*/
				5389	static void nohz_balancer_kick(int cpu)
				5390	{
				5391	int ilb_cpu;
				5392
				5393	nohz.next_balance++;
				5394
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5395	ilb_cpu = find_new_ilb(cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5396
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5397	if (ilb_cpu >= nr_cpu_ids)
				5398	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5399
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	5400	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5401	return;
				5402	/*
				5403	* Use smp_send_reschedule() instead of resched_cpu().
				5404	* This way we generate a sched IPI on the target cpu which
				5405	* is idle. And the softirq performing nohz idle load balance
				5406	* will be run before returning from the IPI.
				5407	*/
				5408	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5409	return;
				5410	}
				5411
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5412	static inline void nohz_balance_exit_idle(int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5413	{
				5414	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
				5415	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				5416	atomic_dec(&nohz.nr_cpus);
				5417	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				5418	}
				5419	}
				5420
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5421	static inline void set_cpu_sd_state_busy(void)
				5422	{
				5423	struct sched_domain *sd;
				5424	int cpu = smp_processor_id();
				5425
				5426	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
				5427	return;
				5428	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
				5429
				5430	rcu_read_lock();
				5431	for_each_domain(cpu, sd)
				5432	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
				5433	rcu_read_unlock();
				5434	}
				5435
				5436	void set_cpu_sd_state_idle(void)
				5437	{
				5438	struct sched_domain *sd;
				5439	int cpu = smp_processor_id();
				5440
				5441	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
				5442	return;
				5443	set_bit(NOHZ_IDLE, nohz_flags(cpu));
				5444
				5445	rcu_read_lock();
				5446	for_each_domain(cpu, sd)
				5447	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
				5448	rcu_read_unlock();
				5449	}
				5450
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5451	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5452	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5453	* This info will be used in performing idle load balancing in the future.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5454	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5455	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5456	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5457	/*
				5458	* If this cpu is going down, then nothing needs to be done.
				5459	*/
				5460	if (!cpu_active(cpu))
				5461	return;
				5462
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5463	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				5464	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5465
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5466	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				5467	atomic_inc(&nohz.nr_cpus);
				5468	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5469	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5470
				5471	static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
				5472	unsigned long action, void *hcpu)
				5473	{
				5474	switch (action & ~CPU_TASKS_FROZEN) {
				5475	case CPU_DYING:
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5476	nohz_balance_exit_idle(smp_processor_id());
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5477	return NOTIFY_OK;
				5478	default:
				5479	return NOTIFY_DONE;
				5480	}
				5481	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5482	#endif
				5483
				5484	static DEFINE_SPINLOCK(balancing); /* NOTE(review): presumably serializes balancing passes (see need_serialize in rebalance_domains) - confirm at the use site */
				5485
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5486	/*
				5487	* Scale the max load_balance interval with the number of CPUs in the system.
				5488	* This trades load-balance latency on larger machines for less cross talk.
				5489	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5490	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5491	{
				5492	max_load_balance_interval = HZ*num_online_cpus()/10;
				5493	}
				5494
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5495	/*
				5496	* It checks each scheduling domain to see if it is due to be balanced,
				5497	* and initiates a balancing operation if so.
				5498	*
Libin	b9b0853	2013-04-01 19:14:01 +0800	[diff] [blame]	5499	* Balancing parameters are set up in init_sched_domains.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5500	*/
				5501	static void rebalance_domains(int cpu, enum cpu_idle_type idle)
				5502	{
				5503	int balance = 1;
				5504	struct rq *rq = cpu_rq(cpu);
				5505	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	5506	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5507	/* Earliest time when we have to do rebalance again */
				5508	unsigned long next_balance = jiffies + 60*HZ;
				5509	int update_next_balance = 0;
				5510	int need_serialize;
				5511
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5512	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	5513
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5514	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5515	for_each_domain(cpu, sd) {
				5516	if (!(sd->flags & SD_LOAD_BALANCE))
				5517	continue;
				5518
				5519	interval = sd->balance_interval;
				5520	if (idle != CPU_IDLE)
				5521	interval *= sd->busy_factor;
				5522
				5523	/* scale ms to jiffies */
				5524	interval = msecs_to_jiffies(interval);
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5525	interval = clamp(interval, 1UL, max_load_balance_interval);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5526
				5527	need_serialize = sd->flags & SD_SERIALIZE;
				5528
				5529	if (need_serialize) {
				5530	if (!spin_trylock(&balancing))
				5531	goto out;
				5532	}
				5533
				5534	if (time_after_eq(jiffies, sd->last_balance + interval)) {
				5535	if (load_balance(cpu, rq, sd, idle, &balance)) {
				5536	/*
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	5537	* The LBF_SOME_PINNED logic could have changed
				5538	* env->dst_cpu, so we can't know our idle
				5539	* state even if we migrated tasks. Update it.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5540	*/
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	5541	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5542	}
				5543	sd->last_balance = jiffies;
				5544	}
				5545	if (need_serialize)
				5546	spin_unlock(&balancing);
				5547	out:
				5548	if (time_after(next_balance, sd->last_balance + interval)) {
				5549	next_balance = sd->last_balance + interval;
				5550	update_next_balance = 1;
				5551	}
				5552
				5553	/*
				5554	* Stop the load balance at this level. There is another
				5555	* CPU in our sched group which is doing load balancing more
				5556	* actively.
				5557	*/
				5558	if (!balance)
				5559	break;
				5560	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5561	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5562
				5563	/*
				5564	* next_balance will be updated only when there is a need.
				5565	* When the cpu is attached to null domain for ex, it will not be
				5566	* updated.
				5567	*/
				5568	if (likely(update_next_balance))
				5569	rq->next_balance = next_balance;
				5570	}
				5571
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5572	#ifdef CONFIG_NO_HZ
				5573	/*
				5574	* In CONFIG_NO_HZ case, the idle balance kickee will do the
				5575	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				5576	*/
				5577	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
				5578	{
				5579	struct rq *this_rq = cpu_rq(this_cpu);
				5580	struct rq *rq;
				5581	int balance_cpu;
				5582
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5583	if (idle != CPU_IDLE \|\|
				5584	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				5585	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5586
				5587	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	5588	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5589	continue;
				5590
				5591	/*
				5592	* If this cpu gets work to do, stop the load balancing
				5593	* work being done for other cpus. Next load
				5594	* balancing owner will pick it up.
				5595	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5596	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5597	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5598
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	5599	rq = cpu_rq(balance_cpu);
				5600
				5601	raw_spin_lock_irq(&rq->lock);
				5602	update_rq_clock(rq);
				5603	update_idle_cpu_load(rq);
				5604	raw_spin_unlock_irq(&rq->lock);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5605
				5606	rebalance_domains(balance_cpu, CPU_IDLE);
				5607
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5608	if (time_after(this_rq->next_balance, rq->next_balance))
				5609	this_rq->next_balance = rq->next_balance;
				5610	}
				5611	nohz.next_balance = this_rq->next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5612	end:
				5613	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5614	}
				5615
				5616	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5617	* Current heuristic for kicking the idle load balancer in the presence
				5618	* of an idle cpu is the system.
				5619	* - This rq has more than one task.
				5620	* - At any scheduler domain level, this cpu's scheduler group has multiple
				5621	* busy cpu's exceeding the group's power.
				5622	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				5623	* domain span are idle.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5624	*/
				5625	static inline int nohz_kick_needed(struct rq *rq, int cpu)
				5626	{
				5627	unsigned long now = jiffies;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5628	struct sched_domain *sd;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5629
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5630	if (unlikely(idle_cpu(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5631	return 0;
				5632
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5633	/*
				5634	* We may be recently in ticked or tickless idle mode. At the first
				5635	* busy tick after returning from idle, we will update the busy stats.
				5636	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5637	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5638	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5639
				5640	/*
				5641	* None are in tickless mode and hence no need for NOHZ idle load
				5642	* balancing.
				5643	*/
				5644	if (likely(!atomic_read(&nohz.nr_cpus)))
				5645	return 0;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5646
				5647	if (time_before(now, nohz.next_balance))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5648	return 0;
				5649
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5650	if (rq->nr_running >= 2)
				5651	goto need_kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5652
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5653	rcu_read_lock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5654	for_each_domain(cpu, sd) {
				5655	struct sched_group *sg = sd->groups;
				5656	struct sched_group_power *sgp = sg->sgp;
				5657	int nr_busy = atomic_read(&sgp->nr_busy_cpus);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5658
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5659	if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5660	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5661
				5662	if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
				5663	&& (cpumask_first_and(nohz.idle_cpus_mask,
				5664	sched_domain_span(sd)) < cpu))
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5665	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5666
				5667	if (!(sd->flags & (SD_SHARE_PKG_RESOURCES \| SD_ASYM_PACKING)))
				5668	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5669	}
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5670	rcu_read_unlock();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5671	return 0;
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5672
				5673	need_kick_unlock:
				5674	rcu_read_unlock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5675	need_kick:
				5676	return 1;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5677	}
				5678	#else
				5679	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
				5680	#endif
				5681
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5682	/*
				5683	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5684	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5685	*/
				5686	static void run_rebalance_domains(struct softirq_action *h)
				5687	{
				5688	int this_cpu = smp_processor_id();
				5689	struct rq *this_rq = cpu_rq(this_cpu);
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	5690	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5691	CPU_IDLE : CPU_NOT_IDLE;
				5692
				5693	rebalance_domains(this_cpu, idle);
				5694
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5695	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5696	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5697	* balancing on behalf of the other idle cpus whose ticks are
				5698	* stopped.
				5699	*/
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5700	nohz_idle_balance(this_cpu, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5701	}
				5702
				5703	static inline int on_null_domain(int cpu)
				5704	{
Paul E. McKenney	90a6501	2010-02-28 08:32:18 -0800	[diff] [blame]	5705	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5706	}
				5707
				5708	/*
				5709	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5710	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5711	void trigger_load_balance(struct rq *rq, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5712	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5713	/* Don't need to rebalance while attached to NULL domain */
				5714	if (time_after_eq(jiffies, rq->next_balance) &&
				5715	likely(!on_null_domain(cpu)))
				5716	raise_softirq(SCHED_SOFTIRQ);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5717	#ifdef CONFIG_NO_HZ
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5718	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5719	nohz_balancer_kick(cpu);
				5720	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5721	}
				5722
/* rq_online hook of the fair class: only calls update_sysctl(). */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}
				5727
				5728	static void rq_offline_fair(struct rq *rq)
				5729	{
				5730	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	5731
				5732	/* Ensure any throttled groups are reachable by pick_next_task */
				5733	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	5734	}
				5735
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	5736	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	5737
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5738	/*
				5739	* scheduler tick hitting a task of our scheduling class:
				5740	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	5741	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5742	{
				5743	struct cfs_rq *cfs_rq;
				5744	struct sched_entity *se = &curr->se;
				5745
				5746	for_each_sched_entity(se) {
				5747	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	5748	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5749	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	5750
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	5751	if (sched_feat_numa(NUMA))
				5752	task_tick_numa(rq, curr);
Linus Torvalds	3d59eeb	2012-12-16 14:33:25 -0800	[diff] [blame]	5753
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	5754	update_rq_runnable_avg(rq, 1);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5755	}
				5756
				5757	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5758	* called on fork with the child task as argument from the parent's context
				5759	* - child not yet on the tasklist
				5760	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5761	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5762	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5763	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	5764	struct cfs_rq *cfs_rq;
				5765	struct sched_entity se = &p->se, curr;
Ingo Molnar	00bf7bf	2007-10-15 17:00:14 +0200	[diff] [blame]	5766	int this_cpu = smp_processor_id();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5767	struct rq *rq = this_rq();
				5768	unsigned long flags;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5769
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	5770	raw_spin_lock_irqsave(&rq->lock, flags);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5771
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	5772	update_rq_clock(rq);
				5773
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	5774	cfs_rq = task_cfs_rq(current);
				5775	curr = cfs_rq->curr;
				5776
Paul E. McKenney	b0a0f66	2010-10-06 17:32:51 -0700	[diff] [blame]	5777	if (unlikely(task_cpu(p) != this_cpu)) {
				5778	rcu_read_lock();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5779	__set_task_cpu(p, this_cpu);
Paul E. McKenney	b0a0f66	2010-10-06 17:32:51 -0700	[diff] [blame]	5780	rcu_read_unlock();
				5781	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5782
Ting Yang	7109c44	2007-08-28 12:53:24 +0200	[diff] [blame]	5783	update_curr(cfs_rq);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5784
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	5785	if (curr)
				5786	se->vruntime = curr->vruntime;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	5787	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5788
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5789	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	5790	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	5791	* Upon rescheduling, sched_class::put_prev_task() will place
				5792	* 'current' within the tree based on its new key value.
				5793	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5794	swap(curr->vruntime, se->vruntime);
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	5795	resched_task(rq->curr);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5796	}
				5797
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	5798	se->vruntime -= cfs_rq->min_vruntime;
				5799
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	5800	raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5801	}
				5802
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5803	/*
				5804	* Priority of the task has changed. Check to see if we preempt
				5805	* the current task.
				5806	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5807	static void
				5808	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5809	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5810	if (!p->se.on_rq)
				5811	return;
				5812
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5813	/*
				5814	* Reschedule if we are currently running on this runqueue and
				5815	* our priority decreased, or if we are not currently running on
				5816	* this runqueue and our priority is higher than the current's
				5817	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5818	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5819	if (p->prio > oldprio)
				5820	resched_task(rq->curr);
				5821	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	5822	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5823	}
				5824
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5825	static void switched_from_fair(struct rq rq, struct task_struct p)
				5826	{
				5827	struct sched_entity *se = &p->se;
				5828	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				5829
				5830	/*
				5831	* Ensure the task's vruntime is normalized, so that when its
				5832	* switched back to the fair class the enqueue_entity(.flags=0) will
				5833	* do the right thing.
				5834	*
				5835	* If it was on_rq, then the dequeue_entity(.flags=0) will already
				5836	* have normalized the vruntime, if it was !on_rq, then only when
				5837	* the task is sleeping will it still have non-normalized vruntime.
				5838	*/
				5839	if (!se->on_rq && p->state != TASK_RUNNING) {
				5840	/*
				5841	* Fix up our vruntime so that the current sleep doesn't
				5842	* cause 'unlimited' sleep bonus.
				5843	*/
				5844	place_entity(cfs_rq, se, 0);
				5845	se->vruntime -= cfs_rq->min_vruntime;
				5846	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5847
				5848	#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
				5849	/*
				5850	* Remove our load from contribution when we leave sched_fair
				5851	* and ensure we don't carry in an old decay_count if we
				5852	* switch back.
				5853	*/
				5854	if (p->se.avg.decay_count) {
				5855	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
				5856	__synchronize_entity_decay(&p->se);
				5857	subtract_blocked_load_contrib(cfs_rq,
				5858	p->se.avg.load_avg_contrib);
				5859	}
				5860	#endif
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5861	}
				5862
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5863	/*
				5864	* We switched to the sched_fair class.
				5865	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5866	static void switched_to_fair(struct rq rq, struct task_struct p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5867	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5868	if (!p->se.on_rq)
				5869	return;
				5870
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5871	/*
				5872	* We were most likely switched from sched_rt, so
				5873	* kick off the schedule if running, otherwise just see
				5874	* if we can still preempt the current task.
				5875	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5876	if (rq->curr == p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5877	resched_task(rq->curr);
				5878	else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	5879	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5880	}
				5881
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	5882	/* Account for a task changing its policy or group.
				5883	*
				5884	* This routine is mostly called to set cfs_rq->curr field when a task
				5885	* migrates between groups/classes.
				5886	*/
				5887	static void set_curr_task_fair(struct rq *rq)
				5888	{
				5889	struct sched_entity *se = &rq->curr->se;
				5890
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	5891	for_each_sched_entity(se) {
				5892	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				5893
				5894	set_next_entity(cfs_rq, se);
				5895	/* ensure bandwidth has been allocated on our new cfs_rq */
				5896	account_cfs_rq_runtime(cfs_rq, 0);
				5897	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	5898	}
				5899
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5900	void init_cfs_rq(struct cfs_rq *cfs_rq)
				5901	{
				5902	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5903	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				5904	#ifndef CONFIG_64BIT
				5905	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				5906	#endif
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5907	#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
				5908	atomic64_set(&cfs_rq->decay_counter, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5909	atomic64_set(&cfs_rq->removed_load, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5910	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5911	}
				5912
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5913	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5914	static void task_move_group_fair(struct task_struct *p, int on_rq)
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5915	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5916	struct cfs_rq *cfs_rq;
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5917	/*
				5918	* If the task was not on the rq at the time of this cgroup movement
				5919	* it must have been asleep, sleeping tasks keep their ->vruntime
				5920	* absolute on their old rq until wakeup (needed for the fair sleeper
				5921	* bonus in place_entity()).
				5922	*
				5923	* If it was on the rq, we've just 'preempted' it, which does convert
				5924	* ->vruntime to a relative base.
				5925	*
				5926	* Make sure both cases convert their relative position when migrating
				5927	* to another cgroup's rq. This does somewhat interfere with the
				5928	* fair sleeper stuff for the first placement, but who cares.
				5929	*/
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5930	/*
				5931	* When !on_rq, vruntime of the task has usually NOT been normalized.
				5932	* But there are some cases where it has already been normalized:
				5933	*
				5934	* - Moving a forked child which is waiting for being woken up by
				5935	* wake_up_new_task().
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	5936	* - Moving a task which has been woken up by try_to_wake_up() and
				5937	* waiting for actually being woken up by sched_ttwu_pending().
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5938	*
				5939	* To prevent boost or penalty in the new cfs_rq caused by delta
				5940	* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
				5941	*/
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	5942	if (!on_rq && (!p->se.sum_exec_runtime \|\| p->state == TASK_WAKING))
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5943	on_rq = 1;
				5944
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	5945	if (!on_rq)
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5946	p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
				5947	set_task_rq(p, task_cpu(p));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5948	if (!on_rq) {
				5949	cfs_rq = cfs_rq_of(&p->se);
				5950	p->se.vruntime += cfs_rq->min_vruntime;
				5951	#ifdef CONFIG_SMP
				5952	/*
				5953	* migrate_task_rq_fair() will have removed our previous
				5954	* contribution, but we must synchronize for ongoing future
				5955	* decay.
				5956	*/
				5957	p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				5958	cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
				5959	#endif
				5960	}
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5961	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5962
				5963	void free_fair_sched_group(struct task_group *tg)
				5964	{
				5965	int i;
				5966
				5967	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				5968
				5969	for_each_possible_cpu(i) {
				5970	if (tg->cfs_rq)
				5971	kfree(tg->cfs_rq[i]);
				5972	if (tg->se)
				5973	kfree(tg->se[i]);
				5974	}
				5975
				5976	kfree(tg->cfs_rq);
				5977	kfree(tg->se);
				5978	}
				5979
				5980	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				5981	{
				5982	struct cfs_rq *cfs_rq;
				5983	struct sched_entity *se;
				5984	int i;
				5985
				5986	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				5987	if (!tg->cfs_rq)
				5988	goto err;
				5989	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				5990	if (!tg->se)
				5991	goto err;
				5992
				5993	tg->shares = NICE_0_LOAD;
				5994
				5995	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				5996
				5997	for_each_possible_cpu(i) {
				5998	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				5999	GFP_KERNEL, cpu_to_node(i));
				6000	if (!cfs_rq)
				6001	goto err;
				6002
				6003	se = kzalloc_node(sizeof(struct sched_entity),
				6004	GFP_KERNEL, cpu_to_node(i));
				6005	if (!se)
				6006	goto err_free_rq;
				6007
				6008	init_cfs_rq(cfs_rq);
				6009	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
				6010	}
				6011
				6012	return 1;
				6013
				6014	err_free_rq:
				6015	kfree(cfs_rq);
				6016	err:
				6017	return 0;
				6018	}
				6019
				6020	void unregister_fair_sched_group(struct task_group *tg, int cpu)
				6021	{
				6022	struct rq *rq = cpu_rq(cpu);
				6023	unsigned long flags;
				6024
				6025	/*
				6026	* Only empty task groups can be destroyed; so we can speculatively
				6027	* check on_list without danger of it being re-added.
				6028	*/
				6029	if (!tg->cfs_rq[cpu]->on_list)
				6030	return;
				6031
				6032	raw_spin_lock_irqsave(&rq->lock, flags);
				6033	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				6034	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6035	}
				6036
				6037	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				6038	struct sched_entity *se, int cpu,
				6039	struct sched_entity *parent)
				6040	{
				6041	struct rq *rq = cpu_rq(cpu);
				6042
				6043	cfs_rq->tg = tg;
				6044	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6045	init_cfs_rq_runtime(cfs_rq);
				6046
				6047	tg->cfs_rq[cpu] = cfs_rq;
				6048	tg->se[cpu] = se;
				6049
				6050	/* se could be NULL for root_task_group */
				6051	if (!se)
				6052	return;
				6053
				6054	if (!parent)
				6055	se->cfs_rq = &rq->cfs;
				6056	else
				6057	se->cfs_rq = parent->my_q;
				6058
				6059	se->my_q = cfs_rq;
				6060	update_load_set(&se->load, 0);
				6061	se->parent = parent;
				6062	}
				6063
				6064	static DEFINE_MUTEX(shares_mutex);
				6065
				6066	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				6067	{
				6068	int i;
				6069	unsigned long flags;
				6070
				6071	/*
				6072	* We can't change the weight of the root cgroup.
				6073	*/
				6074	if (!tg->se[0])
				6075	return -EINVAL;
				6076
				6077	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				6078
				6079	mutex_lock(&shares_mutex);
				6080	if (tg->shares == shares)
				6081	goto done;
				6082
				6083	tg->shares = shares;
				6084	for_each_possible_cpu(i) {
				6085	struct rq *rq = cpu_rq(i);
				6086	struct sched_entity *se;
				6087
				6088	se = tg->se[i];
				6089	/* Propagate contribution to hierarchy */
				6090	raw_spin_lock_irqsave(&rq->lock, flags);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	6091	for_each_sched_entity(se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6092	update_cfs_shares(group_cfs_rq(se));
				6093	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6094	}
				6095
				6096	done:
				6097	mutex_unlock(&shares_mutex);
				6098	return 0;
				6099	}
				6100	#else /* CONFIG_FAIR_GROUP_SCHED */
				6101
				6102	void free_fair_sched_group(struct task_group *tg) { }
				6103
				6104	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				6105	{
				6106	return 1;
				6107	}
				6108
				6109	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
				6110
				6111	#endif /* CONFIG_FAIR_GROUP_SCHED */
				6112
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6113
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	6114	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6115	{
				6116	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6117	unsigned int rr_interval = 0;
				6118
				6119	/*
				6120	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				6121	* idle runqueue:
				6122	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6123	if (rq->cfs.load.weight)
Zhu Yanhai	a59f4e0	2013-01-08 12:56:52 +0800	[diff] [blame]	6124	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6125
				6126	return rr_interval;
				6127	}
				6128
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6129	/*
				6130	* All the scheduling class methods:
				6131	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6132	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	6133	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6134	.enqueue_task = enqueue_task_fair,
				6135	.dequeue_task = dequeue_task_fair,
				6136	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6137	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6138
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	6139	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6140
				6141	.pick_next_task = pick_next_task_fair,
				6142	.put_prev_task = put_prev_task_fair,
				6143
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6144	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	6145	.select_task_rq = select_task_rq_fair,
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	6146	#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6147	.migrate_task_rq = migrate_task_rq_fair,
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	6148	#endif
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	6149	.rq_online = rq_online_fair,
				6150	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6151
				6152	.task_waking = task_waking_fair,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6153	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6154
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6155	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6156	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6157	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6158
				6159	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6160	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6161	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6162
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6163	.get_rr_interval = get_rr_interval_fair,
				6164
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6165	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6166	.task_move_group = task_move_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6167	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6168	};
				6169
				6170	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6171	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6172	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6173	struct cfs_rq *cfs_rq;
				6174
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6175	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	6176	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	6177	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6178	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6179	}
				6180	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6181
				6182	__init void init_sched_fair_class(void)
				6183	{
				6184	#ifdef CONFIG_SMP
				6185	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				6186
				6187	#ifdef CONFIG_NO_HZ
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	6188	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6189	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6190	cpu_notifier(sched_ilb_notifier, 0);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6191	#endif
				6192	#endif /* SMP */
				6193
				6194	}