Blame - kernel/sched/fair.c - SHIFTPHONES/android_kernel_shift_sdm845

blob: c61a614465c8ebf13b5f71aabf23a78c9e8533a6 [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
				20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	23	#include <linux/latencytop.h>
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	24	#include <linux/sched.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	25	#include <linux/cpumask.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	26	#include <linux/slab.h>
				27	#include <linux/profile.h>
				28	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	29	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	30	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	31	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	32
				33	#include <trace/events/sched.h>
				34
				35	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	36
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	37	/*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	38	* Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	39	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	40	*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	41	* NOTE: this latency value is not the same as the concept of
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	42	* 'timeslice length' - timeslices in CFS are of variable length
				43	* and have no persistent notion like in traditional, time-slice
				44	* based scheduling concepts.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	45	*
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	46	* (to see the precise effective timeslice length of your workload,
				47	* run vmstat and monitor the context-switches (cs) field)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	48	*/
Mike Galbraith	2140692	2010-03-11 17:17:15 +0100	[diff] [blame]	49	unsigned int sysctl_sched_latency = 6000000ULL;
				50	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	51
				52	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	53	* The initial- and re-scaling of tunables is configurable
				54	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
				55	*
				56	* Options are:
				57	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				58	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				59	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				60	*/
				61	enum sched_tunable_scaling sysctl_sched_tunable_scaling
				62	= SCHED_TUNABLESCALING_LOG;
				63
				64	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	65	* Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	66	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	67	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	68	unsigned int sysctl_sched_min_granularity = 750000ULL;
				69	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	70
				71	/*
				72	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
				73	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	74	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	75
				76	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	77	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	78	* parent will (try to) run first.
				79	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	80	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	81
				82	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	83	* SCHED_OTHER wake-up granularity.
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	84	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	85	*
				86	* This option delays the preemption effects of decoupled workloads
				87	* and reduces their over-scheduling. Synchronous workloads will still
				88	* have immediate wakeup/sleep latencies.
				89	*/
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	90	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	91	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	92
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	93	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				94
Paul Turner	a7a4f8a	2010-11-15 15:47:06 -0800	[diff] [blame]	95	/*
				96	* The exponential sliding window over which load is averaged for shares
				97	* distribution.
				98	* (default: 10msec)
				99	*/
				100	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
				101
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	102	#ifdef CONFIG_CFS_BANDWIDTH
				103	/*
				104	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				105	* each time a cfs_rq requests quota.
				106	*
				107	* Note: in the case that the slice exceeds the runtime remaining (either due
				108	* to consumption or the quota being specified to be smaller than the slice)
				109	* we will always only issue the remaining available time.
				110	*
				111	* default: 5 msec, units: microseconds
				112	*/
				113	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				114	#endif
				115
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	116	/*
				117	* Increase the granularity value when there are more CPUs,
				118	* because with more CPUs the 'effective latency' as visible
				119	* to users decreases. But the relationship is not linear,
				120	* so pick a second-best guess by going with the log2 of the
				121	* number of CPUs.
				122	*
				123	* This idea comes from the SD scheduler of Con Kolivas:
				124	*/
				125	static int get_update_sysctl_factor(void)
				126	{
				127	unsigned int cpus = min_t(int, num_online_cpus(), 8);
				128	unsigned int factor;
				129
				130	switch (sysctl_sched_tunable_scaling) {
				131	case SCHED_TUNABLESCALING_NONE:
				132	factor = 1;
				133	break;
				134	case SCHED_TUNABLESCALING_LINEAR:
				135	factor = cpus;
				136	break;
				137	case SCHED_TUNABLESCALING_LOG:
				138	default:
				139	factor = 1 + ilog2(cpus);
				140	break;
				141	}
				142
				143	return factor;
				144	}
				145
				146	static void update_sysctl(void)
				147	{
				148	unsigned int factor = get_update_sysctl_factor();
				149
				150	#define SET_SYSCTL(name) \
				151	(sysctl_##name = (factor) * normalized_sysctl_##name)
				152	SET_SYSCTL(sched_min_granularity);
				153	SET_SYSCTL(sched_latency);
				154	SET_SYSCTL(sched_wakeup_granularity);
				155	#undef SET_SYSCTL
				156	}
				157
				158	void sched_init_granularity(void)
				159	{
				160	update_sysctl();
				161	}
				162
				163	#if BITS_PER_LONG == 32
				164	# define WMULT_CONST (~0UL)
				165	#else
				166	# define WMULT_CONST (1UL << 32)
				167	#endif
				168
				169	#define WMULT_SHIFT 32
				170
				171	/*
				172	* Shift right and round:
				173	*/
				174	#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
				175
				176	/*
				177	* delta *= weight / lw
				178	*/
				179	static unsigned long
				180	calc_delta_mine(unsigned long delta_exec, unsigned long weight,
				181	struct load_weight *lw)
				182	{
				183	u64 tmp;
				184
				185	/*
				186	* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
				187	* entities since MIN_SHARES = 2. Treat weight as 1 if less than
				188	* 2^SCHED_LOAD_RESOLUTION.
				189	*/
				190	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
				191	tmp = (u64)delta_exec * scale_load_down(weight);
				192	else
				193	tmp = (u64)delta_exec;
				194
				195	if (!lw->inv_weight) {
				196	unsigned long w = scale_load_down(lw->weight);
				197
				198	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				199	lw->inv_weight = 1;
				200	else if (unlikely(!w))
				201	lw->inv_weight = WMULT_CONST;
				202	else
				203	lw->inv_weight = WMULT_CONST / w;
				204	}
				205
				206	/*
				207	* Check whether we'd overflow the 64-bit multiplication:
				208	*/
				209	if (unlikely(tmp > WMULT_CONST))
				210	tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
				211	WMULT_SHIFT/2);
				212	else
				213	tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
				214
				215	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
				216	}
				217
				218
				219	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	220
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	221	/**************************************************************
				222	* CFS operations on generic schedulable entities:
				223	*/
				224
				225	#ifdef CONFIG_FAIR_GROUP_SCHED
				226
				227	/* cpu runqueue to which this cfs_rq is attached */
				228	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				229	{
				230	return cfs_rq->rq;
				231	}
				232
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	233	/* An entity is a task if it doesn't "own" a runqueue */
				234	#define entity_is_task(se) (!se->my_q)
				235
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	236	static inline struct task_struct task_of(struct sched_entity se)
				237	{
				238	#ifdef CONFIG_SCHED_DEBUG
				239	WARN_ON_ONCE(!entity_is_task(se));
				240	#endif
				241	return container_of(se, struct task_struct, se);
				242	}
				243
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	244	/* Walk up scheduling entities hierarchy */
				245	#define for_each_sched_entity(se) \
				246	for (; se; se = se->parent)
				247
				248	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				249	{
				250	return p->se.cfs_rq;
				251	}
				252
				253	/* runqueue on which this entity is (to be) queued */
				254	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				255	{
				256	return se->cfs_rq;
				257	}
				258
				259	/* runqueue "owned" by this group */
				260	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				261	{
				262	return grp->my_q;
				263	}
				264
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	265	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				266	int force_update);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	267
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	268	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				269	{
				270	if (!cfs_rq->on_list) {
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	271	/*
				272	* Ensure we either appear before our parent (if already
				273	* enqueued) or force our parent to appear after us when it is
				274	* enqueued. The fact that we always enqueue bottom-up
				275	* reduces this to two cases.
				276	*/
				277	if (cfs_rq->tg->parent &&
				278	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
				279	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	280	&rq_of(cfs_rq)->leaf_cfs_rq_list);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	281	} else {
				282	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				283	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				284	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	285
				286	cfs_rq->on_list = 1;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	287	/* We should have no load, but we need to update last_decay. */
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	288	update_cfs_rq_blocked_load(cfs_rq, 0);
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	289	}
				290	}
				291
				292	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				293	{
				294	if (cfs_rq->on_list) {
				295	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				296	cfs_rq->on_list = 0;
				297	}
				298	}
				299
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	300	/* Iterate thr' all leaf cfs_rq's on a runqueue */
				301	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				302	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
				303
				304	/* Do the two (enqueued) entities belong to the same group ? */
				305	static inline int
				306	is_same_group(struct sched_entity se, struct sched_entity pse)
				307	{
				308	if (se->cfs_rq == pse->cfs_rq)
				309	return 1;
				310
				311	return 0;
				312	}
				313
				314	static inline struct sched_entity parent_entity(struct sched_entity se)
				315	{
				316	return se->parent;
				317	}
				318
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	319	/* return depth at which a sched entity is present in the hierarchy */
				320	static inline int depth_se(struct sched_entity *se)
				321	{
				322	int depth = 0;
				323
				324	for_each_sched_entity(se)
				325	depth++;
				326
				327	return depth;
				328	}
				329
				330	static void
				331	find_matching_se(struct sched_entity se, struct sched_entity pse)
				332	{
				333	int se_depth, pse_depth;
				334
				335	/*
				336	* preemption test can be made between sibling entities who are in the
				337	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				338	* both tasks until we find their ancestors who are siblings of common
				339	* parent.
				340	*/
				341
				342	/* First walk up until both entities are at same depth */
				343	se_depth = depth_se(*se);
				344	pse_depth = depth_se(*pse);
				345
				346	while (se_depth > pse_depth) {
				347	se_depth--;
				348	se = parent_entity(se);
				349	}
				350
				351	while (pse_depth > se_depth) {
				352	pse_depth--;
				353	pse = parent_entity(pse);
				354	}
				355
				356	while (!is_same_group(se, pse)) {
				357	se = parent_entity(se);
				358	pse = parent_entity(pse);
				359	}
				360	}
				361
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	362	#else /* !CONFIG_FAIR_GROUP_SCHED */
				363
				364	static inline struct task_struct task_of(struct sched_entity se)
				365	{
				366	return container_of(se, struct task_struct, se);
				367	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	368
				369	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				370	{
				371	return container_of(cfs_rq, struct rq, cfs);
				372	}
				373
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	374	#define entity_is_task(se) 1
				375
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	376	#define for_each_sched_entity(se) \
				377	for (; se; se = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	378
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	379	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	380	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	381	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	382	}
				383
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	384	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				385	{
				386	struct task_struct *p = task_of(se);
				387	struct rq *rq = task_rq(p);
				388
				389	return &rq->cfs;
				390	}
				391
				392	/* runqueue "owned" by this group */
				393	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				394	{
				395	return NULL;
				396	}
				397
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	398	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				399	{
				400	}
				401
				402	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				403	{
				404	}
				405
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	406	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				407	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
				408
				409	static inline int
				410	is_same_group(struct sched_entity se, struct sched_entity pse)
				411	{
				412	return 1;
				413	}
				414
				415	static inline struct sched_entity parent_entity(struct sched_entity se)
				416	{
				417	return NULL;
				418	}
				419
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	420	static inline void
				421	find_matching_se(struct sched_entity se, struct sched_entity pse)
				422	{
				423	}
				424
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	425	#endif /* CONFIG_FAIR_GROUP_SCHED */
				426
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	427	static __always_inline
				428	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	429
				430	/**************************************************************
				431	* Scheduling class tree data structure manipulation methods:
				432	*/
				433
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	434	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	435	{
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	436	s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	437	if (delta > 0)
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	438	max_vruntime = vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	439
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	440	return max_vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	441	}
				442
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	443	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	444	{
				445	s64 delta = (s64)(vruntime - min_vruntime);
				446	if (delta < 0)
				447	min_vruntime = vruntime;
				448
				449	return min_vruntime;
				450	}
				451
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	452	static inline int entity_before(struct sched_entity *a,
				453	struct sched_entity *b)
				454	{
				455	return (s64)(a->vruntime - b->vruntime) < 0;
				456	}
				457
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	458	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				459	{
				460	u64 vruntime = cfs_rq->min_vruntime;
				461
				462	if (cfs_rq->curr)
				463	vruntime = cfs_rq->curr->vruntime;
				464
				465	if (cfs_rq->rb_leftmost) {
				466	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				467	struct sched_entity,
				468	run_node);
				469
Peter Zijlstra	e17036d	2009-01-15 14:53:39 +0100	[diff] [blame]	470	if (!cfs_rq->curr)
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	471	vruntime = se->vruntime;
				472	else
				473	vruntime = min_vruntime(vruntime, se->vruntime);
				474	}
				475
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	476	/* ensure we never gain time by being placed backwards. */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	477	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	478	#ifndef CONFIG_64BIT
				479	smp_wmb();
				480	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				481	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	482	}
				483
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	484	/*
				485	* Enqueue an entity into the rb-tree:
				486	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	487	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	488	{
				489	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				490	struct rb_node *parent = NULL;
				491	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	492	int leftmost = 1;
				493
				494	/*
				495	* Find the right place in the rbtree:
				496	*/
				497	while (*link) {
				498	parent = *link;
				499	entry = rb_entry(parent, struct sched_entity, run_node);
				500	/*
				501	* We dont care about collisions. Nodes with
				502	* the same key stay together.
				503	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	504	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	505	link = &parent->rb_left;
				506	} else {
				507	link = &parent->rb_right;
				508	leftmost = 0;
				509	}
				510	}
				511
				512	/*
				513	* Maintain a cache of leftmost tree entries (it is frequently
				514	* used):
				515	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	516	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	517	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	518
				519	rb_link_node(&se->run_node, parent, link);
				520	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	521	}
				522
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	523	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	524	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	525	if (cfs_rq->rb_leftmost == &se->run_node) {
				526	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	527
				528	next_node = rb_next(&se->run_node);
				529	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	530	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	531
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	532	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	533	}
				534
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	535	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	536	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	537	struct rb_node *left = cfs_rq->rb_leftmost;
				538
				539	if (!left)
				540	return NULL;
				541
				542	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	543	}
				544
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	545	static struct sched_entity __pick_next_entity(struct sched_entity se)
				546	{
				547	struct rb_node *next = rb_next(&se->run_node);
				548
				549	if (!next)
				550	return NULL;
				551
				552	return rb_entry(next, struct sched_entity, run_node);
				553	}
				554
				555	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	556	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	557	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	558	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	559
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	560	if (!last)
				561	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	562
				563	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	564	}
				565
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	566	/**************************************************************
				567	* Scheduling class statistics methods:
				568	*/
				569
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	570	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	571	void __user buffer, size_t lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	572	loff_t *ppos)
				573	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	574	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	575	int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	576
				577	if (ret \|\| !write)
				578	return ret;
				579
				580	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				581	sysctl_sched_min_granularity);
				582
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	583	#define WRT_SYSCTL(name) \
				584	(normalized_sysctl_##name = sysctl_##name / (factor))
				585	WRT_SYSCTL(sched_min_granularity);
				586	WRT_SYSCTL(sched_latency);
				587	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	588	#undef WRT_SYSCTL
				589
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	590	return 0;
				591	}
				592	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	593
				594	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	595	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	596	*/
				597	static inline unsigned long
				598	calc_delta_fair(unsigned long delta, struct sched_entity *se)
				599	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	600	if (unlikely(se->load.weight != NICE_0_LOAD))
				601	delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	602
				603	return delta;
				604	}
				605
				606	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	607	* The idea is to set a period in which each task runs once.
				608	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	609	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	610	* this period because otherwise the slices get too small.
				611	*
				612	* p = (nr <= nl) ? l : l*nr/nl
				613	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	614	static u64 __sched_period(unsigned long nr_running)
				615	{
				616	u64 period = sysctl_sched_latency;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	617	unsigned long nr_latency = sched_nr_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	618
				619	if (unlikely(nr_running > nr_latency)) {
Peter Zijlstra	4bf0b77	2008-01-25 21:08:21 +0100	[diff] [blame]	620	period = sysctl_sched_min_granularity;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	621	period *= nr_running;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	622	}
				623
				624	return period;
				625	}
				626
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	627	/*
				628	* We calculate the wall-time slice from the period by taking a part
				629	* proportional to the weight.
				630	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	631	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	632	*/
Peter Zijlstra	6d0f0ebd	2007-10-15 17:00:05 +0200	[diff] [blame]	633	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	634	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	635	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	636
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	637	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	638	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	639	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	640
				641	cfs_rq = cfs_rq_of(se);
				642	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	643
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	644	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	645	lw = cfs_rq->load;
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	646
				647	update_load_add(&lw, se->load.weight);
				648	load = &lw;
				649	}
				650	slice = calc_delta_mine(slice, se->load.weight, load);
				651	}
				652	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	653	}
				654
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	655	/*
Andrei Epure	660cc00	2013-03-11 12:03:20 +0200	[diff] [blame]	656	* We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	657	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	658	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	659	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	660	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	661	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	662	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	663	}
				664
				665	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	666	* Update the current task's runtime statistics. Skip current tasks that
				667	* are not in our scheduling class.
				668	*/
				669	static inline void
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	670	__update_curr(struct cfs_rq cfs_rq, struct sched_entity curr,
				671	unsigned long delta_exec)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	672	{
Ingo Molnar	bbdba7c	2007-10-15 17:00:06 +0200	[diff] [blame]	673	unsigned long delta_exec_weighted;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	674
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	675	schedstat_set(curr->statistics.exec_max,
				676	max((u64)delta_exec, curr->statistics.exec_max));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	677
				678	curr->sum_exec_runtime += delta_exec;
Ingo Molnar	7a62eab	2007-10-15 17:00:06 +0200	[diff] [blame]	679	schedstat_add(cfs_rq, exec_clock, delta_exec);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	680	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	681
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	682	curr->vruntime += delta_exec_weighted;
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	683	update_min_vruntime(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	684	}
				685
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	686	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	687	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	688	struct sched_entity *curr = cfs_rq->curr;
Venkatesh Pallipadi	305e683	2010-10-04 17:03:21 -0700	[diff] [blame]	689	u64 now = rq_of(cfs_rq)->clock_task;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	690	unsigned long delta_exec;
				691
				692	if (unlikely(!curr))
				693	return;
				694
				695	/*
				696	* Get the amount of time the current task was running
				697	* since the last time we changed load (this cannot
				698	* overflow on 32 bits):
				699	*/
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	700	delta_exec = (unsigned long)(now - curr->exec_start);
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	701	if (!delta_exec)
				702	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	703
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	704	__update_curr(cfs_rq, curr, delta_exec);
				705	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	706
				707	if (entity_is_task(curr)) {
				708	struct task_struct *curtask = task_of(curr);
				709
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	710	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	711	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	712	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	713	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	714
				715	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	716	}
				717
				718	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	719	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	720	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	721	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	722	}
				723
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	724	/*
				725	* Task is being enqueued - update stats:
				726	*/
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	727	static void update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	728	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	729	/*
				730	* Are we enqueueing a waiting task? (for current tasks
				731	* a dequeue/enqueue event is a NOP)
				732	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	733	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	734	update_stats_wait_start(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	735	}
				736
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	737	static void
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	738	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	739	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	740	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
				741	rq_of(cfs_rq)->clock - se->statistics.wait_start));
				742	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
				743	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
				744	rq_of(cfs_rq)->clock - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	745	#ifdef CONFIG_SCHEDSTATS
				746	if (entity_is_task(se)) {
				747	trace_sched_stat_wait(task_of(se),
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	748	rq_of(cfs_rq)->clock - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	749	}
				750	#endif
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	751	schedstat_set(se->statistics.wait_start, 0);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	752	}
				753
				754	static inline void
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	755	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	756	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	757	/*
				758	* Mark the end of the wait period if dequeueing a
				759	* waiting task:
				760	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	761	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	762	update_stats_wait_end(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	763	}
				764
				765	/*
				766	* We are picking a new current task - update its stats:
				767	*/
				768	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	769	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	770	{
				771	/*
				772	* We are starting a new run period:
				773	*/
Venkatesh Pallipadi	305e683	2010-10-04 17:03:21 -0700	[diff] [blame]	774	se->exec_start = rq_of(cfs_rq)->clock_task;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	775	}
				776
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	777	/**************************************************
				778	* Scheduling class queueing methods:
				779	*/
				780
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	781	#ifdef CONFIG_NUMA_BALANCING
				782	/*
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	783	* numa task sample period in ms
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	784	*/
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	785	unsigned int sysctl_numa_balancing_scan_period_min = 100;
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	786	unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
				787	unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	788
				789	/* Portion of address space to scan in MB */
				790	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	791
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	792	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				793	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				794
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	795	static void task_numa_placement(struct task_struct *p)
				796	{
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame]	797	int seq;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	798
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame]	799	if (!p->mm) /* for example, ksmd faulting in a user's mm */
				800	return;
				801	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	802	if (p->numa_scan_seq == seq)
				803	return;
				804	p->numa_scan_seq = seq;
				805
				806	/* FIXME: Scheduling placement policy hints go here */
				807	}
				808
				809	/*
				810	* Got a PROT_NONE fault for a page on @node.
				811	*/
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	812	void task_numa_fault(int node, int pages, bool migrated)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	813	{
				814	struct task_struct *p = current;
				815
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	816	if (!sched_feat_numa(NUMA))
				817	return;
				818
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	819	/* FIXME: Allocate task-specific structure for placement policy here */
				820
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	821	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	822	* If pages are properly placed (did not migrate) then scan slower.
				823	* This is reset periodically in case of phase changes
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	824	*/
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	825	if (!migrated)
				826	p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
				827	p->numa_scan_period + jiffies_to_msecs(10));
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	828
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	829	task_numa_placement(p);
				830	}
				831
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	832	static void reset_ptenuma_scan(struct task_struct *p)
				833	{
				834	ACCESS_ONCE(p->mm->numa_scan_seq)++;
				835	p->mm->numa_scan_offset = 0;
				836	}
				837
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	838	/*
				839	* The expensive part of numa migration is done from task_work context.
				840	* Triggered from task_tick_numa().
				841	*/
				842	void task_numa_work(struct callback_head *work)
				843	{
				844	unsigned long migrate, next_scan, now = jiffies;
				845	struct task_struct *p = current;
				846	struct mm_struct *mm = p->mm;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	847	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	848	unsigned long start, end;
				849	long pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	850
				851	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
				852
				853	work->next = work; /* protect against double add */
				854	/*
				855	* Who cares about NUMA placement when they're dying.
				856	*
				857	* NOTE: make sure not to dereference p->mm before this check,
				858	* exit_task_work() happens _after_ exit_mm() so we could be called
				859	* without p->mm even though we still had it when we enqueued this
				860	* work.
				861	*/
				862	if (p->flags & PF_EXITING)
				863	return;
				864
				865	/*
Mel Gorman	5bca230	2012-11-22 14:40:03 +0000	[diff] [blame]	866	* We do not care about task placement until a task runs on a node
				867	* other than the first one used by the address space. This is
				868	* largely because migrations are driven by what CPU the task
				869	* is running on. If it's never scheduled on another node, it'll
				870	* not migrate so why bother trapping the fault.
				871	*/
				872	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
				873	mm->first_nid = numa_node_id();
				874	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
				875	/* Are we running on a new node yet? */
				876	if (numa_node_id() == mm->first_nid &&
				877	!sched_feat_numa(NUMA_FORCE))
				878	return;
				879
				880	mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
				881	}
				882
				883	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	884	* Reset the scan period if enough time has gone by. Objective is that
				885	* scanning will be reduced if pages are properly placed. As tasks
				886	* can enter different phases this needs to be re-examined. Lacking
				887	* proper tracking of reference behaviour, this blunt hammer is used.
				888	*/
				889	migrate = mm->numa_next_reset;
				890	if (time_after(now, migrate)) {
				891	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
				892	next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				893	xchg(&mm->numa_next_reset, next_scan);
				894	}
				895
				896	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	897	* Enforce maximal scan/migration frequency..
				898	*/
				899	migrate = mm->numa_next_scan;
				900	if (time_before(now, migrate))
				901	return;
				902
				903	if (p->numa_scan_period == 0)
				904	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
				905
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	906	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	907	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				908	return;
				909
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	910	/*
				911	* Do not set pte_numa if the current running node is rate-limited.
				912	* This loses statistics on the fault but if we are unwilling to
				913	* migrate to this node, it is less likely we can do useful work
				914	*/
				915	if (migrate_ratelimited(numa_node_id()))
				916	return;
				917
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	918	start = mm->numa_scan_offset;
				919	pages = sysctl_numa_balancing_scan_size;
				920	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
				921	if (!pages)
				922	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	923
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	924	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	925	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	926	if (!vma) {
				927	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	928	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	929	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	930	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	931	for (; vma; vma = vma->vm_next) {
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	932	if (!vma_migratable(vma))
				933	continue;
				934
				935	/* Skip small VMAs. They are not likely to be of relevance */
Mel Gorman	221392c	2012-12-17 14:05:53 +0000	[diff] [blame]	936	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	937	continue;
				938
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	939	do {
				940	start = max(start, vma->vm_start);
				941	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				942	end = min(end, vma->vm_end);
				943	pages -= change_prot_numa(vma, start, end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	944
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	945	start = end;
				946	if (pages <= 0)
				947	goto out;
				948	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	949	}
				950
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	951	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	952	/*
				953	* It is possible to reach the end of the VMA list but the last few VMAs are
				954	* not guaranteed to the vma_migratable. If they are not, we would find the
				955	* !migratable VMA on the next scan but not reset the scanner to the start
				956	* so check it now.
				957	*/
				958	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	959	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	960	else
				961	reset_ptenuma_scan(p);
				962	up_read(&mm->mmap_sem);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	963	}
				964
				965	/*
				966	* Drive the periodic memory faults..
				967	*/
				968	void task_tick_numa(struct rq rq, struct task_struct curr)
				969	{
				970	struct callback_head *work = &curr->numa_work;
				971	u64 period, now;
				972
				973	/*
				974	* We don't care about NUMA placement if we don't have memory.
				975	*/
				976	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				977	return;
				978
				979	/*
				980	* Using runtime rather than walltime has the dual advantage that
				981	* we (mostly) drive the selection from busy threads and that the
				982	* task needs to have done some actual work before we bother with
				983	* NUMA placement.
				984	*/
				985	now = curr->se.sum_exec_runtime;
				986	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				987
				988	if (now - curr->node_stamp > period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	989	if (!curr->node_stamp)
				990	curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	991	curr->node_stamp = now;
				992
				993	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				994	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				995	task_work_add(curr, work, true);
				996	}
				997	}
				998	}
				999	#else
				1000	static void task_tick_numa(struct rq rq, struct task_struct curr)
				1001	{
				1002	}
				1003	#endif /* CONFIG_NUMA_BALANCING */
				1004
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1005	static void
				1006	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				1007	{
				1008	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1009	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1010	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1011	#ifdef CONFIG_SMP
				1012	if (entity_is_task(se))
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	1013	list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1014	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1015	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1016	}
				1017
				1018	static void
				1019	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				1020	{
				1021	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1022	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1023	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1024	if (entity_is_task(se))
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	1025	list_del_init(&se->group_node);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1026	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1027	}
				1028
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1029	#ifdef CONFIG_FAIR_GROUP_SCHED
				1030	# ifdef CONFIG_SMP
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1031	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)
				1032	{
				1033	long tg_weight;
				1034
				1035	/*
				1036	* Use this CPU's actual weight instead of the last load_contribution
				1037	* to gain a more accurate current total weight. See
				1038	* update_cfs_rq_load_contribution().
				1039	*/
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1040	tg_weight = atomic64_read(&tg->load_avg);
				1041	tg_weight -= cfs_rq->tg_load_contrib;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1042	tg_weight += cfs_rq->load.weight;
				1043
				1044	return tg_weight;
				1045	}
				1046
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1047	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1048	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1049	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1050
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1051	tg_weight = calc_tg_weight(tg, cfs_rq);
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1052	load = cfs_rq->load.weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1053
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1054	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1055	if (tg_weight)
				1056	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1057
				1058	if (shares < MIN_SHARES)
				1059	shares = MIN_SHARES;
				1060	if (shares > tg->shares)
				1061	shares = tg->shares;
				1062
				1063	return shares;
				1064	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1065	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1066	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1067	{
				1068	return tg->shares;
				1069	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1070	# endif /* CONFIG_SMP */
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1071	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				1072	unsigned long weight)
				1073	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1074	if (se->on_rq) {
				1075	/* commit outstanding execution time */
				1076	if (cfs_rq->curr == se)
				1077	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1078	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1079	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1080
				1081	update_load_set(&se->load, weight);
				1082
				1083	if (se->on_rq)
				1084	account_entity_enqueue(cfs_rq, se);
				1085	}
				1086
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1087	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				1088
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1089	static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1090	{
				1091	struct task_group *tg;
				1092	struct sched_entity *se;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1093	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1094
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1095	tg = cfs_rq->tg;
				1096	se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	1097	if (!se \|\| throttled_hierarchy(cfs_rq))
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1098	return;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1099	#ifndef CONFIG_SMP
				1100	if (likely(se->load.weight == tg->shares))
				1101	return;
				1102	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1103	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1104
				1105	reweight_entity(cfs_rq_of(se), se, shares);
				1106	}
				1107	#else /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1108	static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1109	{
				1110	}
				1111	#endif /* CONFIG_FAIR_GROUP_SCHED */
				1112
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	1113	/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
				1114	#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1115	/*
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1116	* We choose a half-life close to 1 scheduling period.
				1117	* Note: The tables below are dependent on this value.
				1118	*/
				1119	#define LOAD_AVG_PERIOD 32
				1120	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
				1121	#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
				1122
				1123	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				1124	static const u32 runnable_avg_yN_inv[] = {
				1125	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				1126	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				1127	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				1128	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				1129	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				1130	0x85aac367, 0x82cd8698,
				1131	};
				1132
				1133	/*
				1134	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				1135	* over-estimates when re-combining.
				1136	*/
				1137	static const u32 runnable_avg_yN_sum[] = {
				1138	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				1139	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				1140	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				1141	};
				1142
				1143	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1144	* Approximate:
				1145	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				1146	*/
				1147	static __always_inline u64 decay_load(u64 val, u64 n)
				1148	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1149	unsigned int local_n;
				1150
				1151	if (!n)
				1152	return val;
				1153	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				1154	return 0;
				1155
				1156	/* after bounds checking we can collapse to 32-bit */
				1157	local_n = n;
				1158
				1159	/*
				1160	* As y^PERIOD = 1/2, we can combine
				1161	* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
				1162	* With a look-up table which covers k^n (n<PERIOD)
				1163	*
				1164	* To achieve constant time decay_load.
				1165	*/
				1166	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				1167	val >>= local_n / LOAD_AVG_PERIOD;
				1168	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1169	}
				1170
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1171	val *= runnable_avg_yN_inv[local_n];
				1172	/* We don't use SRR here since we always want to round down. */
				1173	return val >> 32;
				1174	}
				1175
				1176	/*
				1177	* For updates fully spanning n periods, the contribution to runnable
				1178	* average will be: \Sum 1024*y^n
				1179	*
				1180	* We can compute this reasonably efficiently by combining:
				1181	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				1182	*/
				1183	static u32 __compute_runnable_contrib(u64 n)
				1184	{
				1185	u32 contrib = 0;
				1186
				1187	if (likely(n <= LOAD_AVG_PERIOD))
				1188	return runnable_avg_yN_sum[n];
				1189	else if (unlikely(n >= LOAD_AVG_MAX_N))
				1190	return LOAD_AVG_MAX;
				1191
				1192	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
				1193	do {
				1194	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
				1195	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
				1196
				1197	n -= LOAD_AVG_PERIOD;
				1198	} while (n > LOAD_AVG_PERIOD);
				1199
				1200	contrib = decay_load(contrib, n);
				1201	return contrib + runnable_avg_yN_sum[n];
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1202	}
				1203
				1204	/*
				1205	* We can represent the historical contribution to runnable average as the
				1206	* coefficients of a geometric series. To do this we sub-divide our runnable
				1207	* history into segments of approximately 1ms (1024us); label the segment that
				1208	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				1209	*
				1210	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
				1211	* p0 p1 p2
				1212	* (now) (~1ms ago) (~2ms ago)
				1213	*
				1214	* Let u_i denote the fraction of p_i that the entity was runnable.
				1215	*
				1216	* We then designate the fractions u_i as our co-efficients, yielding the
				1217	* following representation of historical load:
				1218	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
				1219	*
				1220	* We choose y based on the with of a reasonably scheduling period, fixing:
				1221	* y^32 = 0.5
				1222	*
				1223	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				1224	* approximately half as much as the contribution to load within the last ms
				1225	* (u_0).
				1226	*
				1227	* When a period "rolls over" and we have new u_0`, multiplying the previous
				1228	* sum again by y is sufficient to update:
				1229	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
				1230	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
				1231	*/
				1232	static __always_inline int __update_entity_runnable_avg(u64 now,
				1233	struct sched_avg *sa,
				1234	int runnable)
				1235	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1236	u64 delta, periods;
				1237	u32 runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1238	int delta_w, decayed = 0;
				1239
				1240	delta = now - sa->last_runnable_update;
				1241	/*
				1242	* This should only happen when time goes backwards, which it
				1243	* unfortunately does during sched clock init when we swap over to TSC.
				1244	*/
				1245	if ((s64)delta < 0) {
				1246	sa->last_runnable_update = now;
				1247	return 0;
				1248	}
				1249
				1250	/*
				1251	* Use 1024ns as the unit of measurement since it's a reasonable
				1252	* approximation of 1us and fast to compute.
				1253	*/
				1254	delta >>= 10;
				1255	if (!delta)
				1256	return 0;
				1257	sa->last_runnable_update = now;
				1258
				1259	/* delta_w is the amount already accumulated against our next period */
				1260	delta_w = sa->runnable_avg_period % 1024;
				1261	if (delta + delta_w >= 1024) {
				1262	/* period roll-over */
				1263	decayed = 1;
				1264
				1265	/*
				1266	* Now that we know we're crossing a period boundary, figure
				1267	* out how much from delta we need to complete the current
				1268	* period and accrue it.
				1269	*/
				1270	delta_w = 1024 - delta_w;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1271	if (runnable)
				1272	sa->runnable_avg_sum += delta_w;
				1273	sa->runnable_avg_period += delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1274
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1275	delta -= delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1276
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1277	/* Figure out how many additional periods this update spans */
				1278	periods = delta / 1024;
				1279	delta %= 1024;
				1280
				1281	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
				1282	periods + 1);
				1283	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
				1284	periods + 1);
				1285
				1286	/* Efficiently calculate \sum (1..n_period) 1024y^i /
				1287	runnable_contrib = __compute_runnable_contrib(periods);
				1288	if (runnable)
				1289	sa->runnable_avg_sum += runnable_contrib;
				1290	sa->runnable_avg_period += runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1291	}
				1292
				1293	/* Remainder of delta accrued against u_0` */
				1294	if (runnable)
				1295	sa->runnable_avg_sum += delta;
				1296	sa->runnable_avg_period += delta;
				1297
				1298	return decayed;
				1299	}
				1300
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1301	/* Synchronize an entity's decay with its parenting cfs_rq.*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1302	static inline u64 __synchronize_entity_decay(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1303	{
				1304	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1305	u64 decays = atomic64_read(&cfs_rq->decay_counter);
				1306
				1307	decays -= se->avg.decay_count;
				1308	if (!decays)
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1309	return 0;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1310
				1311	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
				1312	se->avg.decay_count = 0;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1313
				1314	return decays;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1315	}
				1316
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1317	#ifdef CONFIG_FAIR_GROUP_SCHED
				1318	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1319	int force_update)
				1320	{
				1321	struct task_group *tg = cfs_rq->tg;
				1322	s64 tg_contrib;
				1323
				1324	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
				1325	tg_contrib -= cfs_rq->tg_load_contrib;
				1326
				1327	if (force_update \|\| abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
				1328	atomic64_add(tg_contrib, &tg->load_avg);
				1329	cfs_rq->tg_load_contrib += tg_contrib;
				1330	}
				1331	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1332
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1333	/*
				1334	* Aggregate cfs_rq runnable averages into an equivalent task_group
				1335	* representation for computing load contributions.
				1336	*/
				1337	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1338	struct cfs_rq *cfs_rq)
				1339	{
				1340	struct task_group *tg = cfs_rq->tg;
				1341	long contrib;
				1342
				1343	/* The fraction of a cpu used by this cfs_rq */
				1344	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
				1345	sa->runnable_avg_period + 1);
				1346	contrib -= cfs_rq->tg_runnable_contrib;
				1347
				1348	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
				1349	atomic_add(contrib, &tg->runnable_avg);
				1350	cfs_rq->tg_runnable_contrib += contrib;
				1351	}
				1352	}
				1353
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1354	static inline void __update_group_entity_contrib(struct sched_entity *se)
				1355	{
				1356	struct cfs_rq *cfs_rq = group_cfs_rq(se);
				1357	struct task_group *tg = cfs_rq->tg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1358	int runnable_avg;
				1359
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1360	u64 contrib;
				1361
				1362	contrib = cfs_rq->tg_load_contrib * tg->shares;
				1363	se->avg.load_avg_contrib = div64_u64(contrib,
				1364	atomic64_read(&tg->load_avg) + 1);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1365
				1366	/*
				1367	* For group entities we need to compute a correction term in the case
				1368	* that they are consuming <1 cpu so that we would contribute the same
				1369	* load as a task of equal weight.
				1370	*
				1371	* Explicitly co-ordinating this measurement would be expensive, but
				1372	* fortunately the sum of each cpus contribution forms a usable
				1373	* lower-bound on the true value.
				1374	*
				1375	* Consider the aggregate of 2 contributions. Either they are disjoint
				1376	* (and the sum represents true value) or they are disjoint and we are
				1377	* understating by the aggregate of their overlap.
				1378	*
				1379	* Extending this to N cpus, for a given overlap, the maximum amount we
				1380	* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
				1381	* cpus that overlap for this interval and w_i is the interval width.
				1382	*
				1383	* On a small machine; the first term is well-bounded which bounds the
				1384	* total error since w_i is a subset of the period. Whereas on a
				1385	* larger machine, while this first term can be larger, if w_i is the
				1386	* of consequential size guaranteed to see n_i*w_i quickly converge to
				1387	* our upper bound of 1-cpu.
				1388	*/
				1389	runnable_avg = atomic_read(&tg->runnable_avg);
				1390	if (runnable_avg < NICE_0_LOAD) {
				1391	se->avg.load_avg_contrib *= runnable_avg;
				1392	se->avg.load_avg_contrib >>= NICE_0_SHIFT;
				1393	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1394	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1395	#else
				1396	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1397	int force_update) {}
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1398	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1399	struct cfs_rq *cfs_rq) {}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1400	static inline void __update_group_entity_contrib(struct sched_entity *se) {}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1401	#endif
				1402
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1403	static inline void __update_task_entity_contrib(struct sched_entity *se)
				1404	{
				1405	u32 contrib;
				1406
				1407	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
				1408	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
				1409	contrib /= (se->avg.runnable_avg_period + 1);
				1410	se->avg.load_avg_contrib = scale_load(contrib);
				1411	}
				1412
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1413	/* Compute the current contribution to load_avg by se, return any delta */
				1414	static long __update_entity_load_avg_contrib(struct sched_entity *se)
				1415	{
				1416	long old_contrib = se->avg.load_avg_contrib;
				1417
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1418	if (entity_is_task(se)) {
				1419	__update_task_entity_contrib(se);
				1420	} else {
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1421	__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1422	__update_group_entity_contrib(se);
				1423	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1424
				1425	return se->avg.load_avg_contrib - old_contrib;
				1426	}
				1427
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1428	static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
				1429	long load_contrib)
				1430	{
				1431	if (likely(load_contrib < cfs_rq->blocked_load_avg))
				1432	cfs_rq->blocked_load_avg -= load_contrib;
				1433	else
				1434	cfs_rq->blocked_load_avg = 0;
				1435	}
				1436
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1437	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
				1438
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1439	/* Update a sched_entity's runnable average */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1440	static inline void update_entity_load_avg(struct sched_entity *se,
				1441	int update_cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1442	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1443	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1444	long contrib_delta;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1445	u64 now;
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1446
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1447	/*
				1448	* For a group entity we need to use their owned cfs_rq_clock_task() in
				1449	* case they are the parent of a throttled hierarchy.
				1450	*/
				1451	if (entity_is_task(se))
				1452	now = cfs_rq_clock_task(cfs_rq);
				1453	else
				1454	now = cfs_rq_clock_task(group_cfs_rq(se));
				1455
				1456	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1457	return;
				1458
				1459	contrib_delta = __update_entity_load_avg_contrib(se);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1460
				1461	if (!update_cfs_rq)
				1462	return;
				1463
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1464	if (se->on_rq)
				1465	cfs_rq->runnable_load_avg += contrib_delta;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1466	else
				1467	subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
				1468	}
				1469
				1470	/*
				1471	* Decay the load contributed by all blocked children and account this so that
				1472	* their contribution may appropriately discounted when they wake up.
				1473	*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1474	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1475	{
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1476	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1477	u64 decays;
				1478
				1479	decays = now - cfs_rq->last_decay;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1480	if (!decays && !force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1481	return;
				1482
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1483	if (atomic64_read(&cfs_rq->removed_load)) {
				1484	u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
				1485	subtract_blocked_load_contrib(cfs_rq, removed_load);
				1486	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1487
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1488	if (decays) {
				1489	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
				1490	decays);
				1491	atomic64_add(decays, &cfs_rq->decay_counter);
				1492	cfs_rq->last_decay = now;
				1493	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1494
				1495	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1496	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1497
				1498	static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
				1499	{
				1500	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1501	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1502	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1503
				1504	/* Add the load generated by se into cfs_rq's child load-average */
				1505	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1506	struct sched_entity *se,
				1507	int wakeup)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1508	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1509	/*
				1510	* We track migrations using entity decay_count <= 0, on a wake-up
				1511	* migration we use a negative decay count to track the remote decays
				1512	* accumulated while sleeping.
				1513	*/
				1514	if (unlikely(se->avg.decay_count <= 0)) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1515	se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1516	if (se->avg.decay_count) {
				1517	/*
				1518	* In a wake-up migration we have to approximate the
				1519	* time sleeping. This is because we can't synchronize
				1520	* clock_task between the two cpus, and it is not
				1521	* guaranteed to be read-safe. Instead, we can
				1522	* approximate this using our carried decays, which are
				1523	* explicitly atomically readable.
				1524	*/
				1525	se->avg.last_runnable_update -= (-se->avg.decay_count)
				1526	<< 20;
				1527	update_entity_load_avg(se, 0);
				1528	/* Indicate that we're now synchronized and on-rq */
				1529	se->avg.decay_count = 0;
				1530	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1531	wakeup = 0;
				1532	} else {
				1533	__synchronize_entity_decay(se);
				1534	}
				1535
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1536	/* migrated tasks did not contribute to our blocked load */
				1537	if (wakeup) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1538	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1539	update_entity_load_avg(se, 0);
				1540	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1541
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1542	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1543	/* we force update consideration on load-balancer moves */
				1544	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1545	}
				1546
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1547	/*
				1548	* Remove se's load from this cfs_rq child load-average, if the entity is
				1549	* transitioning to a blocked state we track its projected decay using
				1550	* blocked_load_avg.
				1551	*/
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1552	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1553	struct sched_entity *se,
				1554	int sleep)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1555	{
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1556	update_entity_load_avg(se, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1557	/* we force update consideration on load-balancer moves */
				1558	update_cfs_rq_blocked_load(cfs_rq, !sleep);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1559
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1560	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1561	if (sleep) {
				1562	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
				1563	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				1564	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1565	}
Vincent Guittot	642dbc3	2013-04-18 18:34:26 +0200	[diff] [blame]	1566
				1567	/*
				1568	* Update the rq's load with the elapsed running time before entering
				1569	* idle. if the last scheduled task is not a CFS task, idle_enter will
				1570	* be the only way to update the runnable statistic.
				1571	*/
				1572	void idle_enter_fair(struct rq *this_rq)
				1573	{
				1574	update_rq_runnable_avg(this_rq, 1);
				1575	}
				1576
				1577	/*
				1578	* Update the rq's load with the elapsed idle time before a task is
				1579	* scheduled. if the newly scheduled task is not a CFS task, idle_exit will
				1580	* be the only way to update the runnable statistic.
				1581	*/
				1582	void idle_exit_fair(struct rq *this_rq)
				1583	{
				1584	update_rq_runnable_avg(this_rq, 0);
				1585	}
				1586
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1587	#else
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1588	static inline void update_entity_load_avg(struct sched_entity *se,
				1589	int update_cfs_rq) {}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1590	static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1591	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1592	struct sched_entity *se,
				1593	int wakeup) {}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1594	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1595	struct sched_entity *se,
				1596	int sleep) {}
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1597	static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				1598	int force_update) {}
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1599	#endif
				1600
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	1601	static void enqueue_sleeper(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1602	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1603	#ifdef CONFIG_SCHEDSTATS
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	1604	struct task_struct *tsk = NULL;
				1605
				1606	if (entity_is_task(se))
				1607	tsk = task_of(se);
				1608
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1609	if (se->statistics.sleep_start) {
				1610	u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1611
				1612	if ((s64)delta < 0)
				1613	delta = 0;
				1614
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1615	if (unlikely(delta > se->statistics.sleep_max))
				1616	se->statistics.sleep_max = delta;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1617
Peter Zijlstra	8c79a04	2012-01-30 14:51:37 +0100	[diff] [blame]	1618	se->statistics.sleep_start = 0;
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1619	se->statistics.sum_sleep_runtime += delta;
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	1620
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	1621	if (tsk) {
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	1622	account_scheduler_latency(tsk, delta >> 10, 1);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	1623	trace_sched_stat_sleep(tsk, delta);
				1624	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1625	}
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1626	if (se->statistics.block_start) {
				1627	u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1628
				1629	if ((s64)delta < 0)
				1630	delta = 0;
				1631
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1632	if (unlikely(delta > se->statistics.block_max))
				1633	se->statistics.block_max = delta;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1634
Peter Zijlstra	8c79a04	2012-01-30 14:51:37 +0100	[diff] [blame]	1635	se->statistics.block_start = 0;
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1636	se->statistics.sum_sleep_runtime += delta;
Ingo Molnar	30084fb	2007-10-02 14:13:08 +0200	[diff] [blame]	1637
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	1638	if (tsk) {
Arjan van de Ven	8f0dfc3	2009-07-20 11:26:58 -0700	[diff] [blame]	1639	if (tsk->in_iowait) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1640	se->statistics.iowait_sum += delta;
				1641	se->statistics.iowait_count++;
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	1642	trace_sched_stat_iowait(tsk, delta);
Arjan van de Ven	8f0dfc3	2009-07-20 11:26:58 -0700	[diff] [blame]	1643	}
				1644
Andrew Vagin	b781a60	2011-11-28 12:03:35 +0300	[diff] [blame]	1645	trace_sched_stat_blocked(tsk, delta);
				1646
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	1647	/*
				1648	* Blocking time is in units of nanosecs, so shift by
				1649	* 20 to get a milliseconds-range estimation of the
				1650	* amount of time that the task spent sleeping:
				1651	*/
				1652	if (unlikely(prof_on == SLEEP_PROFILING)) {
				1653	profile_hits(SLEEP_PROFILING,
				1654	(void *)get_wchan(tsk),
				1655	delta >> 20);
				1656	}
				1657	account_scheduler_latency(tsk, delta >> 10, 0);
Ingo Molnar	30084fb	2007-10-02 14:13:08 +0200	[diff] [blame]	1658	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1659	}
				1660	#endif
				1661	}
				1662
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1663	static void check_spread(struct cfs_rq cfs_rq, struct sched_entity se)
				1664	{
				1665	#ifdef CONFIG_SCHED_DEBUG
				1666	s64 d = se->vruntime - cfs_rq->min_vruntime;
				1667
				1668	if (d < 0)
				1669	d = -d;
				1670
				1671	if (d > 3*sysctl_sched_latency)
				1672	schedstat_inc(cfs_rq, nr_spread_over);
				1673	#endif
				1674	}
				1675
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1676	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1677	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				1678	{
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	1679	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1680
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	1681	/*
				1682	* The 'current' period is already promised to the current tasks,
				1683	* however the extra weight of the new task will slow them down a
				1684	* little, place the new task so that it fits in the slot that
				1685	* stays open at the end.
				1686	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1687	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	1688	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1689
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1690	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	1691	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1692	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	1693
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1694	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1695	* Halve their sleep time's effect, to allow
				1696	* for a gentler effect of sleepers:
				1697	*/
				1698	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				1699	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	1700
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1701	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1702	}
				1703
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	1704	/* ensure we never gain time by being placed backwards. */
Viresh Kumar	16c8f1c	2012-11-08 13:33:46 +0530	[diff] [blame]	1705	se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1706	}
				1707
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1708	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				1709
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1710	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1711	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1712	{
				1713	/*
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1714	* Update the normalized vruntime before updating min_vruntime
				1715	* through callig update_curr().
				1716	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1717	if (!(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_WAKING))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1718	se->vruntime += cfs_rq->min_vruntime;
				1719
				1720	/*
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1721	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1722	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	1723	update_curr(cfs_rq);
Paul Turner	f269ae0	2012-10-04 13:18:31 +0200	[diff] [blame]	1724	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1725	account_entity_enqueue(cfs_rq, se);
				1726	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1727
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1728	if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1729	place_entity(cfs_rq, se, 0);
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	1730	enqueue_sleeper(cfs_rq, se);
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	1731	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1732
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	1733	update_stats_enqueue(cfs_rq, se);
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1734	check_spread(cfs_rq, se);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1735	if (se != cfs_rq->curr)
				1736	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1737	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1738
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1739	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1740	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1741	check_enqueue_throttle(cfs_rq);
				1742	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1743	}
				1744
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1745	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1746	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1747	for_each_sched_entity(se) {
				1748	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1749	if (cfs_rq->last == se)
				1750	cfs_rq->last = NULL;
				1751	else
				1752	break;
				1753	}
				1754	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1755
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1756	static void __clear_buddies_next(struct sched_entity *se)
				1757	{
				1758	for_each_sched_entity(se) {
				1759	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1760	if (cfs_rq->next == se)
				1761	cfs_rq->next = NULL;
				1762	else
				1763	break;
				1764	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1765	}
				1766
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1767	static void __clear_buddies_skip(struct sched_entity *se)
				1768	{
				1769	for_each_sched_entity(se) {
				1770	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1771	if (cfs_rq->skip == se)
				1772	cfs_rq->skip = NULL;
				1773	else
				1774	break;
				1775	}
				1776	}
				1777
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1778	static void clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
				1779	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1780	if (cfs_rq->last == se)
				1781	__clear_buddies_last(se);
				1782
				1783	if (cfs_rq->next == se)
				1784	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1785
				1786	if (cfs_rq->skip == se)
				1787	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1788	}
				1789
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	1790	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1791
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1792	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1793	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1794	{
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1795	/*
				1796	* Update run-time statistics of the 'current'.
				1797	*/
				1798	update_curr(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1799	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1800
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	1801	update_stats_dequeue(cfs_rq, se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1802	if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1803	#ifdef CONFIG_SCHEDSTATS
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1804	if (entity_is_task(se)) {
				1805	struct task_struct *tsk = task_of(se);
				1806
				1807	if (tsk->state & TASK_INTERRUPTIBLE)
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1808	se->statistics.sleep_start = rq_of(cfs_rq)->clock;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1809	if (tsk->state & TASK_UNINTERRUPTIBLE)
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1810	se->statistics.block_start = rq_of(cfs_rq)->clock;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1811	}
Dmitry Adamushko	db36cc7	2007-10-15 17:00:06 +0200	[diff] [blame]	1812	#endif
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1813	}
				1814
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1815	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	1816
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1817	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1818	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1819	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1820	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1821
				1822	/*
				1823	* Normalize the entity after updating the min_vruntime because the
				1824	* update can refer to the ->curr item and we need to reflect this
				1825	* movement in our normalized position.
				1826	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1827	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1828	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1829
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1830	/* return excess runtime on last dequeue */
				1831	return_cfs_rq_runtime(cfs_rq);
				1832
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1833	update_min_vruntime(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1834	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1835	}
				1836
				1837	/*
				1838	* Preempt the current task with a newly woken task if needed:
				1839	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	1840	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	1841	check_preempt_tick(struct cfs_rq cfs_rq, struct sched_entity curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1842	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1843	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1844	struct sched_entity *se;
				1845	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1846
Peter Zijlstra	6d0f0ebd	2007-10-15 17:00:05 +0200	[diff] [blame]	1847	ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1848	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1849	if (delta_exec > ideal_runtime) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1850	resched_task(rq_of(cfs_rq)->curr);
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1851	/*
				1852	* The current task ran long enough, ensure it doesn't get
				1853	* re-elected due to buddy favours.
				1854	*/
				1855	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1856	return;
				1857	}
				1858
				1859	/*
				1860	* Ensure that a task that missed wakeup preemption by a
				1861	* narrow margin doesn't have to wait for a full slice.
				1862	* This also mitigates buddy induced latencies under load.
				1863	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1864	if (delta_exec < sysctl_sched_min_granularity)
				1865	return;
				1866
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1867	se = __pick_first_entity(cfs_rq);
				1868	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1869
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1870	if (delta < 0)
				1871	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	1872
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1873	if (delta > ideal_runtime)
				1874	resched_task(rq_of(cfs_rq)->curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1875	}
				1876
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1877	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	1878	set_next_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1879	{
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1880	/* 'current' is not kept within the tree. */
				1881	if (se->on_rq) {
				1882	/*
				1883	* Any task has to be enqueued before it get to execute on
				1884	* a CPU. So account for the time it spent waiting on the
				1885	* runqueue.
				1886	*/
				1887	update_stats_wait_end(cfs_rq, se);
				1888	__dequeue_entity(cfs_rq, se);
				1889	}
				1890
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	1891	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1892	cfs_rq->curr = se;
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	1893	#ifdef CONFIG_SCHEDSTATS
				1894	/*
				1895	* Track our maximum slice length, if the CPU's load is at
				1896	* least twice that of our own weight (i.e. dont track it
				1897	* when there are only lesser-weight tasks around):
				1898	*/
Dmitry Adamushko	495eca4	2007-10-15 17:00:06 +0200	[diff] [blame]	1899	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1900	se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	1901	se->sum_exec_runtime - se->prev_sum_exec_runtime);
				1902	}
				1903	#endif
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	1904	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1905	}
				1906
Peter Zijlstra	3f3a490	2008-10-24 11:06:16 +0200	[diff] [blame]	1907	static int
				1908	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se);
				1909
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1910	/*
				1911	* Pick the next process, keeping these things in mind, in this order:
				1912	* 1) keep things fair between processes/task groups
				1913	* 2) pick the "next" process, since someone really wants that to run
				1914	* 3) pick the "last" process, for cache locality
				1915	* 4) do not run the "skip" process, if something else is available
				1916	*/
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	1917	static struct sched_entity pick_next_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1918	{
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1919	struct sched_entity *se = __pick_first_entity(cfs_rq);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1920	struct sched_entity *left = se;
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	1921
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1922	/*
				1923	* Avoid running the skip buddy, if running something else can
				1924	* be done without getting too unfair.
				1925	*/
				1926	if (cfs_rq->skip == se) {
				1927	struct sched_entity *second = __pick_next_entity(se);
				1928	if (second && wakeup_preempt_entity(second, left) < 1)
				1929	se = second;
				1930	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1931
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1932	/*
				1933	* Prefer last buddy, try to return the CPU to a preempted task.
				1934	*/
				1935	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				1936	se = cfs_rq->last;
				1937
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1938	/*
				1939	* Someone really wants this to run. If it's not unfair, run it.
				1940	*/
				1941	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				1942	se = cfs_rq->next;
				1943
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1944	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	1945
				1946	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1947	}
				1948
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1949	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				1950
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	1951	static void put_prev_entity(struct cfs_rq cfs_rq, struct sched_entity prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1952	{
				1953	/*
				1954	* If still on the runqueue then deactivate_task()
				1955	* was not called and update_curr() has to be done:
				1956	*/
				1957	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	1958	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1959
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1960	/* throttle cfs_rqs exceeding runtime */
				1961	check_cfs_rq_runtime(cfs_rq);
				1962
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1963	check_spread(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1964	if (prev->on_rq) {
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	1965	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1966	/* Put 'current' back into the tree. */
				1967	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1968	/* in !on_rq case, update occurred at dequeue */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1969	update_entity_load_avg(prev, 1);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1970	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1971	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1972	}
				1973
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1974	static void
				1975	entity_tick(struct cfs_rq cfs_rq, struct sched_entity curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1976	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1977	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1978	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1979	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1980	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1981
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	1982	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1983	* Ensure that runnable average is periodically updated.
				1984	*/
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1985	update_entity_load_avg(curr, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1986	update_cfs_rq_blocked_load(cfs_rq, 1);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1987
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1988	#ifdef CONFIG_SCHED_HRTICK
				1989	/*
				1990	* queued ticks are scheduled to match the slice, so don't bother
				1991	* validating it and just reschedule.
				1992	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	1993	if (queued) {
				1994	resched_task(rq_of(cfs_rq)->curr);
				1995	return;
				1996	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1997	/*
				1998	* don't let the period tick interfere with the hrtick preemption
				1999	*/
				2000	if (!sched_feat(DOUBLE_TICK) &&
				2001	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				2002	return;
				2003	#endif
				2004
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	2005	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	2006	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2007	}
				2008
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2009
				2010	/**************************************************
				2011	* CFS bandwidth control machinery
				2012	*/
				2013
				2014	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2015
				2016	#ifdef HAVE_JUMP_LABEL
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2017	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2018
				2019	static inline bool cfs_bandwidth_used(void)
				2020	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2021	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2022	}
				2023
				2024	void account_cfs_bandwidth_used(int enabled, int was_enabled)
				2025	{
				2026	/* only need to count groups transitioning between enabled/!enabled */
				2027	if (enabled && !was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2028	static_key_slow_inc(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2029	else if (!enabled && was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2030	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2031	}
				2032	#else /* HAVE_JUMP_LABEL */
				2033	static bool cfs_bandwidth_used(void)
				2034	{
				2035	return true;
				2036	}
				2037
				2038	void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
				2039	#endif /* HAVE_JUMP_LABEL */
				2040
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2041	/*
				2042	* default period for cfs group bandwidth.
				2043	* default: 0.1s, units: nanoseconds
				2044	*/
				2045	static inline u64 default_cfs_period(void)
				2046	{
				2047	return 100000000ULL;
				2048	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2049
				2050	static inline u64 sched_cfs_bandwidth_slice(void)
				2051	{
				2052	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				2053	}
				2054
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2055	/*
				2056	* Replenish runtime according to assigned quota and update expiration time.
				2057	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				2058	* additional synchronization around rq->lock.
				2059	*
				2060	* requires cfs_b->lock
				2061	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2062	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2063	{
				2064	u64 now;
				2065
				2066	if (cfs_b->quota == RUNTIME_INF)
				2067	return;
				2068
				2069	now = sched_clock_cpu(smp_processor_id());
				2070	cfs_b->runtime = cfs_b->quota;
				2071	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				2072	}
				2073
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2074	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				2075	{
				2076	return &tg->cfs_bandwidth;
				2077	}
				2078
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2079	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				2080	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2081	{
				2082	if (unlikely(cfs_rq->throttle_count))
				2083	return cfs_rq->throttled_clock_task;
				2084
				2085	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
				2086	}
				2087
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2088	/* returns 0 on failure to allocate runtime */
				2089	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2090	{
				2091	struct task_group *tg = cfs_rq->tg;
				2092	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2093	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2094
				2095	/* note: this is a positive sum as runtime_remaining <= 0 */
				2096	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				2097
				2098	raw_spin_lock(&cfs_b->lock);
				2099	if (cfs_b->quota == RUNTIME_INF)
				2100	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2101	else {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2102	/*
				2103	* If the bandwidth pool has become inactive, then at least one
				2104	* period must have elapsed since the last consumption.
				2105	* Refresh the global state and ensure bandwidth timer becomes
				2106	* active.
				2107	*/
				2108	if (!cfs_b->timer_active) {
				2109	__refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2110	__start_cfs_bandwidth(cfs_b);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2111	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2112
				2113	if (cfs_b->runtime > 0) {
				2114	amount = min(cfs_b->runtime, min_amount);
				2115	cfs_b->runtime -= amount;
				2116	cfs_b->idle = 0;
				2117	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2118	}
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2119	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2120	raw_spin_unlock(&cfs_b->lock);
				2121
				2122	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2123	/*
				2124	* we may have advanced our local expiration to account for allowed
				2125	* spread between our sched_clock and the one on which runtime was
				2126	* issued.
				2127	*/
				2128	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				2129	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2130
				2131	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2132	}
				2133
				2134	/*
				2135	* Note: This depends on the synchronization provided by sched_clock and the
				2136	* fact that rq->clock snapshots this value.
				2137	*/
				2138	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2139	{
				2140	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2141	struct rq *rq = rq_of(cfs_rq);
				2142
				2143	/* if the deadline is ahead of our clock, nothing to do */
				2144	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
				2145	return;
				2146
				2147	if (cfs_rq->runtime_remaining < 0)
				2148	return;
				2149
				2150	/*
				2151	* If the local deadline has passed we have to consider the
				2152	* possibility that our sched_clock is 'fast' and the global deadline
				2153	* has not truly expired.
				2154	*
				2155	* Fortunately we can check determine whether this the case by checking
				2156	* whether the global deadline has advanced.
				2157	*/
				2158
				2159	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
				2160	/* extend local deadline, drift is bounded above by 2 ticks */
				2161	cfs_rq->runtime_expires += TICK_NSEC;
				2162	} else {
				2163	/* global deadline is ahead, expiration has passed */
				2164	cfs_rq->runtime_remaining = 0;
				2165	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2166	}
				2167
				2168	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2169	unsigned long delta_exec)
				2170	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2171	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2172	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2173	expire_cfs_rq_runtime(cfs_rq);
				2174
				2175	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2176	return;
				2177
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2178	/*
				2179	* if we're unable to extend our runtime we resched so that the active
				2180	* hierarchy can be throttled
				2181	*/
				2182	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
				2183	resched_task(rq_of(cfs_rq)->curr);
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2184	}
				2185
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2186	static __always_inline
				2187	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2188	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2189	if (!cfs_bandwidth_used() \|\| !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2190	return;
				2191
				2192	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				2193	}
				2194
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2195	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2196	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2197	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2198	}
				2199
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2200	/* check whether cfs_rq, or any parent, is throttled */
				2201	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2202	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2203	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2204	}
				2205
				2206	/*
				2207	* Ensure that neither of the group entities corresponding to src_cpu or
				2208	* dest_cpu are members of a throttled hierarchy when performing group
				2209	* load-balance operations.
				2210	*/
				2211	static inline int throttled_lb_pair(struct task_group *tg,
				2212	int src_cpu, int dest_cpu)
				2213	{
				2214	struct cfs_rq src_cfs_rq, dest_cfs_rq;
				2215
				2216	src_cfs_rq = tg->cfs_rq[src_cpu];
				2217	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				2218
				2219	return throttled_hierarchy(src_cfs_rq) \|\|
				2220	throttled_hierarchy(dest_cfs_rq);
				2221	}
				2222
				2223	/* updated child weight may affect parent so we have to do this bottom up */
				2224	static int tg_unthrottle_up(struct task_group tg, void data)
				2225	{
				2226	struct rq *rq = data;
				2227	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2228
				2229	cfs_rq->throttle_count--;
				2230	#ifdef CONFIG_SMP
				2231	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2232	/* adjust cfs_rq_clock_task() */
				2233	cfs_rq->throttled_clock_task_time += rq->clock_task -
				2234	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2235	}
				2236	#endif
				2237
				2238	return 0;
				2239	}
				2240
				2241	static int tg_throttle_down(struct task_group tg, void data)
				2242	{
				2243	struct rq *rq = data;
				2244	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2245
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2246	/* group is entering throttled state, stop time */
				2247	if (!cfs_rq->throttle_count)
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2248	cfs_rq->throttled_clock_task = rq->clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2249	cfs_rq->throttle_count++;
				2250
				2251	return 0;
				2252	}
				2253
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2254	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2255	{
				2256	struct rq *rq = rq_of(cfs_rq);
				2257	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2258	struct sched_entity *se;
				2259	long task_delta, dequeue = 1;
				2260
				2261	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2262
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2263	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2264	rcu_read_lock();
				2265	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				2266	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2267
				2268	task_delta = cfs_rq->h_nr_running;
				2269	for_each_sched_entity(se) {
				2270	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				2271	/* throttled entity or throttle-on-deactivate */
				2272	if (!se->on_rq)
				2273	break;
				2274
				2275	if (dequeue)
				2276	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				2277	qcfs_rq->h_nr_running -= task_delta;
				2278
				2279	if (qcfs_rq->load.weight)
				2280	dequeue = 0;
				2281	}
				2282
				2283	if (!se)
				2284	rq->nr_running -= task_delta;
				2285
				2286	cfs_rq->throttled = 1;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2287	cfs_rq->throttled_clock = rq->clock;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2288	raw_spin_lock(&cfs_b->lock);
				2289	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
				2290	raw_spin_unlock(&cfs_b->lock);
				2291	}
				2292
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2293	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2294	{
				2295	struct rq *rq = rq_of(cfs_rq);
				2296	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2297	struct sched_entity *se;
				2298	int enqueue = 1;
				2299	long task_delta;
				2300
				2301	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2302
				2303	cfs_rq->throttled = 0;
				2304	raw_spin_lock(&cfs_b->lock);
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2305	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2306	list_del_rcu(&cfs_rq->throttled_list);
				2307	raw_spin_unlock(&cfs_b->lock);
				2308
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2309	update_rq_clock(rq);
				2310	/* update hierarchical throttle state */
				2311	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				2312
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2313	if (!cfs_rq->load.weight)
				2314	return;
				2315
				2316	task_delta = cfs_rq->h_nr_running;
				2317	for_each_sched_entity(se) {
				2318	if (se->on_rq)
				2319	enqueue = 0;
				2320
				2321	cfs_rq = cfs_rq_of(se);
				2322	if (enqueue)
				2323	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				2324	cfs_rq->h_nr_running += task_delta;
				2325
				2326	if (cfs_rq_throttled(cfs_rq))
				2327	break;
				2328	}
				2329
				2330	if (!se)
				2331	rq->nr_running += task_delta;
				2332
				2333	/* determine whether we need to wake up potentially idle cpu */
				2334	if (rq->curr == rq->idle && rq->cfs.nr_running)
				2335	resched_task(rq->curr);
				2336	}
				2337
				2338	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				2339	u64 remaining, u64 expires)
				2340	{
				2341	struct cfs_rq *cfs_rq;
				2342	u64 runtime = remaining;
				2343
				2344	rcu_read_lock();
				2345	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				2346	throttled_list) {
				2347	struct rq *rq = rq_of(cfs_rq);
				2348
				2349	raw_spin_lock(&rq->lock);
				2350	if (!cfs_rq_throttled(cfs_rq))
				2351	goto next;
				2352
				2353	runtime = -cfs_rq->runtime_remaining + 1;
				2354	if (runtime > remaining)
				2355	runtime = remaining;
				2356	remaining -= runtime;
				2357
				2358	cfs_rq->runtime_remaining += runtime;
				2359	cfs_rq->runtime_expires = expires;
				2360
				2361	/* we check whether we're throttled above */
				2362	if (cfs_rq->runtime_remaining > 0)
				2363	unthrottle_cfs_rq(cfs_rq);
				2364
				2365	next:
				2366	raw_spin_unlock(&rq->lock);
				2367
				2368	if (!remaining)
				2369	break;
				2370	}
				2371	rcu_read_unlock();
				2372
				2373	return remaining;
				2374	}
				2375
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2376	/*
				2377	* Responsible for refilling a task_group's bandwidth and unthrottling its
				2378	* cfs_rqs as appropriate. If there has been no activity within the last
				2379	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				2380	* used to track this state.
				2381	*/
				2382	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				2383	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2384	u64 runtime, runtime_expires;
				2385	int idle = 1, throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2386
				2387	raw_spin_lock(&cfs_b->lock);
				2388	/* no need to continue the timer with no bandwidth constraint */
				2389	if (cfs_b->quota == RUNTIME_INF)
				2390	goto out_unlock;
				2391
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2392	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2393	/* idle depends on !throttled (for the case of a large deficit) */
				2394	idle = cfs_b->idle && !throttled;
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2395	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2396
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2397	/* if we're going inactive then everything else can be deferred */
				2398	if (idle)
				2399	goto out_unlock;
				2400
				2401	__refill_cfs_bandwidth_runtime(cfs_b);
				2402
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2403	if (!throttled) {
				2404	/* mark as potentially idle for the upcoming period */
				2405	cfs_b->idle = 1;
				2406	goto out_unlock;
				2407	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2408
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2409	/* account preceding periods in which throttling occurred */
				2410	cfs_b->nr_throttled += overrun;
				2411
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2412	/*
				2413	* There are throttled entities so we must first use the new bandwidth
				2414	* to unthrottle them before making it generally available. This
				2415	* ensures that all existing debts will be paid before a new cfs_rq is
				2416	* allowed to run.
				2417	*/
				2418	runtime = cfs_b->runtime;
				2419	runtime_expires = cfs_b->runtime_expires;
				2420	cfs_b->runtime = 0;
				2421
				2422	/*
				2423	* This check is repeated as we are holding onto the new bandwidth
				2424	* while we unthrottle. This can potentially race with an unthrottled
				2425	* group trying to acquire new bandwidth from the global pool.
				2426	*/
				2427	while (throttled && runtime > 0) {
				2428	raw_spin_unlock(&cfs_b->lock);
				2429	/* we can't nest cfs_b->lock while distributing bandwidth */
				2430	runtime = distribute_cfs_runtime(cfs_b, runtime,
				2431	runtime_expires);
				2432	raw_spin_lock(&cfs_b->lock);
				2433
				2434	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2435	}
				2436
				2437	/* return (any) remaining runtime */
				2438	cfs_b->runtime = runtime;
				2439	/*
				2440	* While we are ensured activity in the period following an
				2441	* unthrottle, this also covers the case in which the new bandwidth is
				2442	* insufficient to cover the existing bandwidth deficit. (Forcing the
				2443	* timer to remain active while there are any throttled entities.)
				2444	*/
				2445	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2446	out_unlock:
				2447	if (idle)
				2448	cfs_b->timer_active = 0;
				2449	raw_spin_unlock(&cfs_b->lock);
				2450
				2451	return idle;
				2452	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2453
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2454	/* a cfs_rq won't donate quota below this amount */
				2455	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				2456	/* minimum remaining period time to redistribute slack quota */
				2457	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				2458	/* how long we wait to gather additional slack before distributing */
				2459	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				2460
				2461	/* are we near the end of the current quota period? */
				2462	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				2463	{
				2464	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				2465	u64 remaining;
				2466
				2467	/* if the call-back is running a quota refresh is already occurring */
				2468	if (hrtimer_callback_running(refresh_timer))
				2469	return 1;
				2470
				2471	/* is a quota refresh about to occur? */
				2472	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				2473	if (remaining < min_expire)
				2474	return 1;
				2475
				2476	return 0;
				2477	}
				2478
				2479	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				2480	{
				2481	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				2482
				2483	/* if there's a quota refresh soon don't bother with slack */
				2484	if (runtime_refresh_within(cfs_b, min_left))
				2485	return;
				2486
				2487	start_bandwidth_timer(&cfs_b->slack_timer,
				2488	ns_to_ktime(cfs_bandwidth_slack_period));
				2489	}
				2490
				2491	/* we know any runtime found here is valid as update_curr() precedes return */
				2492	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2493	{
				2494	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2495	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				2496
				2497	if (slack_runtime <= 0)
				2498	return;
				2499
				2500	raw_spin_lock(&cfs_b->lock);
				2501	if (cfs_b->quota != RUNTIME_INF &&
				2502	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				2503	cfs_b->runtime += slack_runtime;
				2504
				2505	/* we are under rq->lock, defer unthrottling using a timer */
				2506	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				2507	!list_empty(&cfs_b->throttled_cfs_rq))
				2508	start_cfs_slack_bandwidth(cfs_b);
				2509	}
				2510	raw_spin_unlock(&cfs_b->lock);
				2511
				2512	/* even if it's not valid for return we don't want to try again */
				2513	cfs_rq->runtime_remaining -= slack_runtime;
				2514	}
				2515
				2516	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2517	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2518	if (!cfs_bandwidth_used())
				2519	return;
				2520
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	2521	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2522	return;
				2523
				2524	__return_cfs_rq_runtime(cfs_rq);
				2525	}
				2526
				2527	/*
				2528	* This is done with a timer (instead of inline with bandwidth return) since
				2529	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				2530	*/
				2531	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				2532	{
				2533	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				2534	u64 expires;
				2535
				2536	/* confirm we're still not at a refresh boundary */
				2537	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
				2538	return;
				2539
				2540	raw_spin_lock(&cfs_b->lock);
				2541	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
				2542	runtime = cfs_b->runtime;
				2543	cfs_b->runtime = 0;
				2544	}
				2545	expires = cfs_b->runtime_expires;
				2546	raw_spin_unlock(&cfs_b->lock);
				2547
				2548	if (!runtime)
				2549	return;
				2550
				2551	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				2552
				2553	raw_spin_lock(&cfs_b->lock);
				2554	if (expires == cfs_b->runtime_expires)
				2555	cfs_b->runtime = runtime;
				2556	raw_spin_unlock(&cfs_b->lock);
				2557	}
				2558
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2559	/*
				2560	* When a group wakes up we want to make sure that its quota is not already
				2561	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				2562	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				2563	*/
				2564	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				2565	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2566	if (!cfs_bandwidth_used())
				2567	return;
				2568
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2569	/* an active group must be handled by the update_curr()->put() path */
				2570	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				2571	return;
				2572
				2573	/* ensure the group is not already throttled */
				2574	if (cfs_rq_throttled(cfs_rq))
				2575	return;
				2576
				2577	/* update runtime allocation */
				2578	account_cfs_rq_runtime(cfs_rq, 0);
				2579	if (cfs_rq->runtime_remaining <= 0)
				2580	throttle_cfs_rq(cfs_rq);
				2581	}
				2582
				2583	/* conditionally throttle active cfs_rq's from put_prev_entity() */
				2584	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2585	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2586	if (!cfs_bandwidth_used())
				2587	return;
				2588
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2589	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
				2590	return;
				2591
				2592	/*
				2593	* it's possible for a throttled entity to be forced into a running
				2594	* state (e.g. set_curr_task), in this case we're finished.
				2595	*/
				2596	if (cfs_rq_throttled(cfs_rq))
				2597	return;
				2598
				2599	throttle_cfs_rq(cfs_rq);
				2600	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2601
				2602	static inline u64 default_cfs_period(void);
				2603	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
				2604	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
				2605
				2606	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				2607	{
				2608	struct cfs_bandwidth *cfs_b =
				2609	container_of(timer, struct cfs_bandwidth, slack_timer);
				2610	do_sched_cfs_slack_timer(cfs_b);
				2611
				2612	return HRTIMER_NORESTART;
				2613	}
				2614
				2615	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				2616	{
				2617	struct cfs_bandwidth *cfs_b =
				2618	container_of(timer, struct cfs_bandwidth, period_timer);
				2619	ktime_t now;
				2620	int overrun;
				2621	int idle = 0;
				2622
				2623	for (;;) {
				2624	now = hrtimer_cb_get_time(timer);
				2625	overrun = hrtimer_forward(timer, now, cfs_b->period);
				2626
				2627	if (!overrun)
				2628	break;
				2629
				2630	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				2631	}
				2632
				2633	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				2634	}
				2635
				2636	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2637	{
				2638	raw_spin_lock_init(&cfs_b->lock);
				2639	cfs_b->runtime = 0;
				2640	cfs_b->quota = RUNTIME_INF;
				2641	cfs_b->period = ns_to_ktime(default_cfs_period());
				2642
				2643	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
				2644	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2645	cfs_b->period_timer.function = sched_cfs_period_timer;
				2646	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2647	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				2648	}
				2649
				2650	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2651	{
				2652	cfs_rq->runtime_enabled = 0;
				2653	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				2654	}
				2655
				2656	/* requires cfs_b->lock, may release to reprogram timer */
				2657	void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2658	{
				2659	/*
				2660	* The timer may be active because we're trying to set a new bandwidth
				2661	* period or because we're racing with the tear-down path
				2662	* (timer_active==0 becomes visible before the hrtimer call-back
				2663	* terminates). In either case we ensure that it's re-programmed
				2664	*/
				2665	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
				2666	raw_spin_unlock(&cfs_b->lock);
				2667	/* ensure cfs_b->lock is available while we wait */
				2668	hrtimer_cancel(&cfs_b->period_timer);
				2669
				2670	raw_spin_lock(&cfs_b->lock);
				2671	/* if someone else restarted the timer then we're done */
				2672	if (cfs_b->timer_active)
				2673	return;
				2674	}
				2675
				2676	cfs_b->timer_active = 1;
				2677	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
				2678	}
				2679
				2680	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2681	{
				2682	hrtimer_cancel(&cfs_b->period_timer);
				2683	hrtimer_cancel(&cfs_b->slack_timer);
				2684	}
				2685
Arnd Bergmann	38dc334	2013-01-25 14:14:22 +0000	[diff] [blame]	2686	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2687	{
				2688	struct cfs_rq *cfs_rq;
				2689
				2690	for_each_leaf_cfs_rq(rq, cfs_rq) {
				2691	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2692
				2693	if (!cfs_rq->runtime_enabled)
				2694	continue;
				2695
				2696	/*
				2697	* clock_task is not advancing so we just need to make sure
				2698	* there's some valid quota amount
				2699	*/
				2700	cfs_rq->runtime_remaining = cfs_b->quota;
				2701	if (cfs_rq_throttled(cfs_rq))
				2702	unthrottle_cfs_rq(cfs_rq);
				2703	}
				2704	}
				2705
				2706	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2707	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2708	{
				2709	return rq_of(cfs_rq)->clock_task;
				2710	}
				2711
				2712	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2713	unsigned long delta_exec) {}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2714	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				2715	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2716	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2717
				2718	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2719	{
				2720	return 0;
				2721	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2722
				2723	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2724	{
				2725	return 0;
				2726	}
				2727
				2728	static inline int throttled_lb_pair(struct task_group *tg,
				2729	int src_cpu, int dest_cpu)
				2730	{
				2731	return 0;
				2732	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2733
				2734	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				2735
				2736	#ifdef CONFIG_FAIR_GROUP_SCHED
				2737	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2738	#endif
				2739
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2740	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				2741	{
				2742	return NULL;
				2743	}
				2744	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	2745	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2746
				2747	#endif /* CONFIG_CFS_BANDWIDTH */
				2748
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2749	/**************************************************
				2750	* CFS operations on tasks:
				2751	*/
				2752
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2753	#ifdef CONFIG_SCHED_HRTICK
				2754	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				2755	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2756	struct sched_entity *se = &p->se;
				2757	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2758
				2759	WARN_ON(task_rq(p) != rq);
				2760
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2761	if (cfs_rq->nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2762	u64 slice = sched_slice(cfs_rq, se);
				2763	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				2764	s64 delta = slice - ran;
				2765
				2766	if (delta < 0) {
				2767	if (rq->curr == p)
				2768	resched_task(p);
				2769	return;
				2770	}
				2771
				2772	/*
				2773	* Don't schedule slices shorter than 10000ns, that just
				2774	* doesn't make sense. Rely on vruntime for fairness.
				2775	*/
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2776	if (rq->curr != p)
Peter Zijlstra	157124c	2008-07-28 11:53:11 +0200	[diff] [blame]	2777	delta = max_t(s64, 10000LL, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2778
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2779	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2780	}
				2781	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2782
				2783	/*
				2784	* called from enqueue/dequeue and updates the hrtick when the
				2785	* current task is from our class and nr_running is low enough
				2786	* to matter.
				2787	*/
				2788	static void hrtick_update(struct rq *rq)
				2789	{
				2790	struct task_struct *curr = rq->curr;
				2791
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2792	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2793	return;
				2794
				2795	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				2796	hrtick_start_fair(rq, curr);
				2797	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	2798	#else /* !CONFIG_SCHED_HRTICK */
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2799	static inline void
				2800	hrtick_start_fair(struct rq rq, struct task_struct p)
				2801	{
				2802	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2803
				2804	static inline void hrtick_update(struct rq *rq)
				2805	{
				2806	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2807	#endif
				2808
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2809	/*
				2810	* The enqueue_task method is called before nr_running is
				2811	* increased. Here we update the fair scheduling stats and
				2812	* then put the task into the rbtree:
				2813	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	2814	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2815	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2816	{
				2817	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2818	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2819
				2820	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2821	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2822	break;
				2823	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2824	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2825
				2826	/*
				2827	* end evaluation on encountering a throttled cfs_rq
				2828	*
				2829	* note: in the case of encountering a throttled cfs_rq we will
				2830	* post the final h_nr_running increment below.
				2831	*/
				2832	if (cfs_rq_throttled(cfs_rq))
				2833	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2834	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2835
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2836	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2837	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2838
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2839	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	2840	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2841	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2842
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2843	if (cfs_rq_throttled(cfs_rq))
				2844	break;
				2845
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2846	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2847	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2848	}
				2849
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2850	if (!se) {
				2851	update_rq_runnable_avg(rq, rq->nr_running);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2852	inc_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2853	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2854	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2855	}
				2856
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2857	static void set_next_buddy(struct sched_entity *se);
				2858
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2859	/*
				2860	* The dequeue_task method is called before nr_running is
				2861	* decreased. We remove the task from the rbtree and
				2862	* update the fair scheduling stats:
				2863	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2864	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2865	{
				2866	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2867	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2868	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2869
				2870	for_each_sched_entity(se) {
				2871	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2872	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2873
				2874	/*
				2875	* end evaluation on encountering a throttled cfs_rq
				2876	*
				2877	* note: in the case of encountering a throttled cfs_rq we will
				2878	* post the final h_nr_running decrement below.
				2879	*/
				2880	if (cfs_rq_throttled(cfs_rq))
				2881	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2882	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2883
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2884	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2885	if (cfs_rq->load.weight) {
				2886	/*
				2887	* Bias pick_next to pick a task from this cfs_rq, as
				2888	* p is sleeping when it is within its sched_slice.
				2889	*/
				2890	if (task_sleep && parent_entity(se))
				2891	set_next_buddy(parent_entity(se));
Paul Turner	9598c82	2011-07-06 22:30:37 -0700	[diff] [blame]	2892
				2893	/* avoid re-evaluating load for this entity */
				2894	se = parent_entity(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2895	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2896	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2897	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2898	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2899
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2900	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	2901	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2902	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2903
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2904	if (cfs_rq_throttled(cfs_rq))
				2905	break;
				2906
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2907	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2908	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2909	}
				2910
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2911	if (!se) {
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2912	dec_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2913	update_rq_runnable_avg(rq, 1);
				2914	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2915	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2916	}
				2917
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	2918	#ifdef CONFIG_SMP
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2919	/* Used instead of source_load when we know the type == 0 */
				2920	static unsigned long weighted_cpuload(const int cpu)
				2921	{
				2922	return cpu_rq(cpu)->load.weight;
				2923	}
				2924
				2925	/*
				2926	* Return a low guess at the load of a migration-source cpu weighted
				2927	* according to the scheduling class and "nice" value.
				2928	*
				2929	* We want to under-estimate the load of migration sources, to
				2930	* balance conservatively.
				2931	*/
				2932	static unsigned long source_load(int cpu, int type)
				2933	{
				2934	struct rq *rq = cpu_rq(cpu);
				2935	unsigned long total = weighted_cpuload(cpu);
				2936
				2937	if (type == 0 \|\| !sched_feat(LB_BIAS))
				2938	return total;
				2939
				2940	return min(rq->cpu_load[type-1], total);
				2941	}
				2942
				2943	/*
				2944	* Return a high guess at the load of a migration-target cpu weighted
				2945	* according to the scheduling class and "nice" value.
				2946	*/
				2947	static unsigned long target_load(int cpu, int type)
				2948	{
				2949	struct rq *rq = cpu_rq(cpu);
				2950	unsigned long total = weighted_cpuload(cpu);
				2951
				2952	if (type == 0 \|\| !sched_feat(LB_BIAS))
				2953	return total;
				2954
				2955	return max(rq->cpu_load[type-1], total);
				2956	}
				2957
				2958	static unsigned long power_of(int cpu)
				2959	{
				2960	return cpu_rq(cpu)->cpu_power;
				2961	}
				2962
				2963	static unsigned long cpu_avg_load_per_task(int cpu)
				2964	{
				2965	struct rq *rq = cpu_rq(cpu);
				2966	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
				2967
				2968	if (nr_running)
				2969	return rq->load.weight / nr_running;
				2970
				2971	return 0;
				2972	}
				2973
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	2974
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	2975	static void task_waking_fair(struct task_struct *p)
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2976	{
				2977	struct sched_entity *se = &p->se;
				2978	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2979	u64 min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2980
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2981	#ifndef CONFIG_64BIT
				2982	u64 min_vruntime_copy;
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	2983
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2984	do {
				2985	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				2986	smp_rmb();
				2987	min_vruntime = cfs_rq->min_vruntime;
				2988	} while (min_vruntime != min_vruntime_copy);
				2989	#else
				2990	min_vruntime = cfs_rq->min_vruntime;
				2991	#endif
				2992
				2993	se->vruntime -= min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2994	}
				2995
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	2996	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	2997	/*
				2998	* effective_load() calculates the load change as seen from the root_task_group
				2999	*
				3000	* Adding load to a group doesn't make a group heavier, but can cause movement
				3001	* of group shares between cpus. Assuming the shares were perfectly aligned one
				3002	* can calculate the shift in shares.
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3003	*
				3004	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				3005	* on this @cpu and results in a total addition (subtraction) of @wg to the
				3006	* total group weight.
				3007	*
				3008	* Given a runqueue weight distribution (rw_i) we can compute a shares
				3009	* distribution (s_i) using:
				3010	*
				3011	* s_i = rw_i / \Sum rw_j (1)
				3012	*
				3013	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				3014	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				3015	* shares distribution (s_i):
				3016	*
				3017	* rw_i = { 2, 4, 1, 0 }
				3018	* s_i = { 2/7, 4/7, 1/7, 0 }
				3019	*
				3020	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				3021	* task used to run on and the CPU the waker is running on), we need to
				3022	* compute the effect of waking a task on either CPU and, in case of a sync
				3023	* wakeup, compute the effect of the current task going to sleep.
				3024	*
				3025	* So for a change of @wl to the local @cpu with an overall group weight change
				3026	* of @wl we can compute the new shares distribution (s'_i) using:
				3027	*
				3028	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				3029	*
				3030	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				3031	* differences in waking a task to CPU 0. The additional task changes the
				3032	* weight and shares distributions like:
				3033	*
				3034	* rw'_i = { 3, 4, 1, 0 }
				3035	* s'_i = { 3/8, 4/8, 1/8, 0 }
				3036	*
				3037	* We can then compute the difference in effective weight by using:
				3038	*
				3039	* dw_i = S * (s'_i - s_i) (3)
				3040	*
				3041	* Where 'S' is the group weight as seen by its parent.
				3042	*
				3043	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				3044	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				3045	* 4/7) times the weight of the group.
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3046	*/
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3047	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3048	{
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3049	struct sched_entity *se = tg->se[cpu];
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3050
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3051	if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3052	return wl;
				3053
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3054	for_each_sched_entity(se) {
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3055	long w, W;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3056
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3057	tg = se->my_q->tg;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3058
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3059	/*
				3060	* W = @wg + \Sum rw_j
				3061	*/
				3062	W = wg + calc_tg_weight(tg, se->my_q);
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3063
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3064	/*
				3065	* w = rw_i + @wl
				3066	*/
				3067	w = se->my_q->load.weight + wl;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3068
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3069	/*
				3070	* wl = S * s'_i; see (2)
				3071	*/
				3072	if (W > 0 && w < W)
				3073	wl = (w * tg->shares) / W;
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3074	else
				3075	wl = tg->shares;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3076
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3077	/*
				3078	* Per the above, wl is the new se->load.weight value; since
				3079	* those are clipped to [MIN_SHARES, ...) do so now. See
				3080	* calc_cfs_shares().
				3081	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3082	if (wl < MIN_SHARES)
				3083	wl = MIN_SHARES;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3084
				3085	/*
				3086	* wl = dw_i = S * (s'_i - s_i); see (3)
				3087	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3088	wl -= se->load.weight;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3089
				3090	/*
				3091	* Recursively apply this logic to all parent groups to compute
				3092	* the final effective load change on the root group. Since
				3093	* only the @tg group gets extra weight, all parent groups can
				3094	* only redistribute existing shares. @wl is the shift in shares
				3095	* resulting from this level per the above.
				3096	*/
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3097	wg = 0;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3098	}
				3099
				3100	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3101	}
				3102	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3103
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3104	static inline unsigned long effective_load(struct task_group *tg, int cpu,
				3105	unsigned long wl, unsigned long wg)
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3106	{
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3107	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3108	}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3109
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3110	#endif
				3111
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3112	static int wake_affine(struct sched_domain sd, struct task_struct p, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3113	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3114	s64 this_load, load;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3115	int idx, this_cpu, prev_cpu;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3116	unsigned long tl_per_task;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3117	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3118	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3119	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3120
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3121	idx = sd->wake_idx;
				3122	this_cpu = smp_processor_id();
				3123	prev_cpu = task_cpu(p);
				3124	load = source_load(prev_cpu, idx);
				3125	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3126
				3127	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3128	* If sync wakeup then subtract the (maximum possible)
				3129	* effect of the currently running task from the load
				3130	* of the current CPU:
				3131	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3132	if (sync) {
				3133	tg = task_group(current);
				3134	weight = current->se.load.weight;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3135
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3136	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3137	load += effective_load(tg, prev_cpu, 0, -weight);
				3138	}
				3139
				3140	tg = task_group(p);
				3141	weight = p->se.load.weight;
				3142
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3143	/*
				3144	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3145	* due to the sync cause above having dropped this_load to 0, we'll
				3146	* always have an imbalance, but there's really nothing you can do
				3147	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3148	*
				3149	* Otherwise check if either cpus are near enough in load to allow this
				3150	* task to be woken on this_cpu.
				3151	*/
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3152	if (this_load > 0) {
				3153	s64 this_eff_load, prev_eff_load;
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	3154
				3155	this_eff_load = 100;
				3156	this_eff_load *= power_of(prev_cpu);
				3157	this_eff_load *= this_load +
				3158	effective_load(tg, this_cpu, weight, weight);
				3159
				3160	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				3161	prev_eff_load *= power_of(this_cpu);
				3162	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
				3163
				3164	balanced = this_eff_load <= prev_eff_load;
				3165	} else
				3166	balanced = true;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3167
				3168	/*
				3169	* If the currently running task will sleep within
				3170	* a reasonable amount of time then attract this newly
				3171	* woken task:
				3172	*/
Peter Zijlstra	2fb7635	2008-10-08 09:16:04 +0200	[diff] [blame]	3173	if (sync && balanced)
				3174	return 1;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3175
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3176	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3177	tl_per_task = cpu_avg_load_per_task(this_cpu);
				3178
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3179	if (balanced \|\|
				3180	(this_load <= load &&
				3181	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3182	/*
				3183	* This domain has SD_WAKE_AFFINE and
				3184	* p is cache cold in this domain, and
				3185	* there is no bad imbalance.
				3186	*/
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3187	schedstat_inc(sd, ttwu_move_affine);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3188	schedstat_inc(p, se.statistics.nr_wakeups_affine);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3189
				3190	return 1;
				3191	}
				3192	return 0;
				3193	}
				3194
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3195	/*
				3196	* find_idlest_group finds and returns the least busy CPU group within the
				3197	* domain.
				3198	*/
				3199	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	3200	find_idlest_group(struct sched_domain sd, struct task_struct p,
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3201	int this_cpu, int load_idx)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3202	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	3203	struct sched_group idlest = NULL, group = sd->groups;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3204	unsigned long min_load = ULONG_MAX, this_load = 0;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3205	int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3206
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3207	do {
				3208	unsigned long load, avg_load;
				3209	int local_group;
				3210	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3211
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3212	/* Skip over this group if it has no CPUs allowed */
				3213	if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3214	tsk_cpus_allowed(p)))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3215	continue;
				3216
				3217	local_group = cpumask_test_cpu(this_cpu,
				3218	sched_group_cpus(group));
				3219
				3220	/* Tally up the load of all CPUs in the group */
				3221	avg_load = 0;
				3222
				3223	for_each_cpu(i, sched_group_cpus(group)) {
				3224	/* Bias balancing toward cpus of our domain */
				3225	if (local_group)
				3226	load = source_load(i, load_idx);
				3227	else
				3228	load = target_load(i, load_idx);
				3229
				3230	avg_load += load;
				3231	}
				3232
				3233	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	3234	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3235
				3236	if (local_group) {
				3237	this_load = avg_load;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3238	} else if (avg_load < min_load) {
				3239	min_load = avg_load;
				3240	idlest = group;
				3241	}
				3242	} while (group = group->next, group != sd->groups);
				3243
				3244	if (!idlest \|\| 100this_load < imbalancemin_load)
				3245	return NULL;
				3246	return idlest;
				3247	}
				3248
				3249	/*
				3250	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				3251	*/
				3252	static int
				3253	find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				3254	{
				3255	unsigned long load, min_load = ULONG_MAX;
				3256	int idlest = -1;
				3257	int i;
				3258
				3259	/* Traverse only the allowed CPUs */
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3260	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3261	load = weighted_cpuload(i);
				3262
				3263	if (load < min_load \|\| (load == min_load && i == this_cpu)) {
				3264	min_load = load;
				3265	idlest = i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3266	}
				3267	}
				3268
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3269	return idlest;
				3270	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3271
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3272	/*
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3273	* Try and locate an idle CPU in the sched_domain.
				3274	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3275	static int select_idle_sibling(struct task_struct *p, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3276	{
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3277	struct sched_domain *sd;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3278	struct sched_group *sg;
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3279	int i = task_cpu(p);
				3280
				3281	if (idle_cpu(target))
				3282	return target;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3283
				3284	/*
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3285	* If the prevous cpu is cache affine and idle, don't be stupid.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3286	*/
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3287	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
				3288	return i;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3289
				3290	/*
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3291	* Otherwise, iterate the domains and find an elegible idle cpu.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3292	*/
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	3293	sd = rcu_dereference(per_cpu(sd_llc, target));
Suresh Siddha	77e8136	2011-11-17 11:08:23 -0800	[diff] [blame]	3294	for_each_lower_domain(sd) {
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3295	sg = sd->groups;
				3296	do {
				3297	if (!cpumask_intersects(sched_group_cpus(sg),
				3298	tsk_cpus_allowed(p)))
				3299	goto next;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	3300
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3301	for_each_cpu(i, sched_group_cpus(sg)) {
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3302	if (i == target \|\| !idle_cpu(i))
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3303	goto next;
				3304	}
				3305
				3306	target = cpumask_first_and(sched_group_cpus(sg),
				3307	tsk_cpus_allowed(p));
				3308	goto done;
				3309	next:
				3310	sg = sg->next;
				3311	} while (sg != sd->groups);
				3312	}
				3313	done:
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3314	return target;
				3315	}
				3316
				3317	/*
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3318	* sched_balance_self: balance the current task (running on cpu) in domains
				3319	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
				3320	* SD_BALANCE_EXEC.
				3321	*
				3322	* Balance, ie. select the least loaded group.
				3323	*
				3324	* Returns the target CPU number, or the same CPU if no balancing is needed.
				3325	*
				3326	* preempt must be disabled.
				3327	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	3328	static int
Peter Zijlstra	7608dec	2011-04-05 17:23:46 +0200	[diff] [blame]	3329	select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3330	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3331	struct sched_domain tmp, affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3332	int cpu = smp_processor_id();
				3333	int prev_cpu = task_cpu(p);
				3334	int new_cpu = cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3335	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3336	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3337
Peter Zijlstra	29baa74	2012-04-23 12:11:21 +0200	[diff] [blame]	3338	if (p->nr_cpus_allowed == 1)
Mike Galbraith	76854c7	2011-11-22 15:18:24 +0100	[diff] [blame]	3339	return prev_cpu;
				3340
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3341	if (sd_flag & SD_BALANCE_WAKE) {
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3342	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3343	want_affine = 1;
				3344	new_cpu = prev_cpu;
				3345	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3346
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3347	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3348	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f42888	2009-12-16 18:04:34 +0100	[diff] [blame]	3349	if (!(tmp->flags & SD_LOAD_BALANCE))
				3350	continue;
				3351
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3352	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3353	* If both cpu and prev_cpu are part of this domain,
				3354	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	3355	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3356	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				3357	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				3358	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3359	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3360	}
				3361
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3362	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3363	sd = tmp;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3364	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3365
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3366	if (affine_sd) {
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3367	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3368	prev_cpu = cpu;
				3369
				3370	new_cpu = select_idle_sibling(p, prev_cpu);
				3371	goto unlock;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3372	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	3373
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3374	while (sd) {
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3375	int load_idx = sd->forkexec_idx;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3376	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3377	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3378
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3379	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3380	sd = sd->child;
				3381	continue;
				3382	}
				3383
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3384	if (sd_flag & SD_BALANCE_WAKE)
				3385	load_idx = sd->wake_idx;
				3386
				3387	group = find_idlest_group(sd, p, cpu, load_idx);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3388	if (!group) {
				3389	sd = sd->child;
				3390	continue;
				3391	}
				3392
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	3393	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3394	if (new_cpu == -1 \|\| new_cpu == cpu) {
				3395	/* Now try balancing at a lower domain level of cpu */
				3396	sd = sd->child;
				3397	continue;
				3398	}
				3399
				3400	/* Now try balancing at a lower domain level of new_cpu */
				3401	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3402	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3403	sd = NULL;
				3404	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3405	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3406	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3407	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3408	sd = tmp;
				3409	}
				3410	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3411	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3412	unlock:
				3413	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3414
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3415	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3416	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3417
				3418	/*
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	3419	* Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
				3420	* removed when useful for applications beyond shares distribution (e.g.
				3421	* load-balance).
				3422	*/
				3423	#ifdef CONFIG_FAIR_GROUP_SCHED
				3424	/*
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3425	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				3426	* cfs_rq_of(p) references at time of call are still valid and identify the
				3427	* previous cpu. However, the caller only guarantees p->pi_lock is held; no
				3428	* other assumptions, including the state of rq->lock, should be made.
				3429	*/
				3430	static void
				3431	migrate_task_rq_fair(struct task_struct *p, int next_cpu)
				3432	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	3433	struct sched_entity *se = &p->se;
				3434	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3435
				3436	/*
				3437	* Load tracking: accumulate removed load so that it can be processed
				3438	* when we next update owning cfs_rq under rq->lock. Tasks contribute
				3439	* to blocked load iff they have a positive decay-count. It can never
				3440	* be negative here since on-rq tasks have decay-count == 0.
				3441	*/
				3442	if (se->avg.decay_count) {
				3443	se->avg.decay_count = -__synchronize_entity_decay(se);
				3444	atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
				3445	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3446	}
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	3447	#endif
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3448	#endif /* CONFIG_SMP */
				3449
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3450	static unsigned long
				3451	wakeup_gran(struct sched_entity curr, struct sched_entity se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3452	{
				3453	unsigned long gran = sysctl_sched_wakeup_granularity;
				3454
				3455	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3456	* Since its curr running now, convert the gran from real-time
				3457	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	3458	*
				3459	* By using 'se' instead of 'curr' we penalize light tasks, so
				3460	* they get preempted easier. That is, if 'se' < 'curr' then
				3461	* the resulting gran will be larger, therefore penalizing the
				3462	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				3463	* be smaller, again penalizing the lighter task.
				3464	*
				3465	* This is especially important for buddies when the leftmost
				3466	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3467	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	3468	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3469	}
				3470
				3471	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3472	* Should 'se' preempt 'curr'.
				3473	*
				3474	* \|s1
				3475	* \|s2
				3476	* \|s3
				3477	* g
				3478	* \|<--->\|c
				3479	*
				3480	* w(c, s1) = -1
				3481	* w(c, s2) = 0
				3482	* w(c, s3) = 1
				3483	*
				3484	*/
				3485	static int
				3486	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				3487	{
				3488	s64 gran, vdiff = curr->vruntime - se->vruntime;
				3489
				3490	if (vdiff <= 0)
				3491	return -1;
				3492
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3493	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3494	if (vdiff > gran)
				3495	return 1;
				3496
				3497	return 0;
				3498	}
				3499
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3500	static void set_last_buddy(struct sched_entity *se)
				3501	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3502	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3503	return;
				3504
				3505	for_each_sched_entity(se)
				3506	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3507	}
				3508
				3509	static void set_next_buddy(struct sched_entity *se)
				3510	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3511	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3512	return;
				3513
				3514	for_each_sched_entity(se)
				3515	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3516	}
				3517
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3518	static void set_skip_buddy(struct sched_entity *se)
				3519	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3520	for_each_sched_entity(se)
				3521	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3522	}
				3523
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3524	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3525	* Preempt the current task with a newly woken task if needed:
				3526	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	3527	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3528	{
				3529	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	3530	struct sched_entity se = &curr->se, pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3531	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3532	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3533	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3534
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	3535	if (unlikely(se == pse))
				3536	return;
				3537
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3538	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3539	* This is possible from callers such as move_task(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3540	* unconditionally check_prempt_curr() after an enqueue (which may have
				3541	* lead to a throttle). This both saves work and prevents false
				3542	* next-buddy nomination below.
				3543	*/
				3544	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				3545	return;
				3546
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3547	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	3548	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3549	next_buddy_marked = 1;
				3550	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	3551
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3552	/*
				3553	* We can come here with TIF_NEED_RESCHED already set from new task
				3554	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3555	*
				3556	* Note: this also catches the edge-case of curr being in a throttled
				3557	* group (e.g. via set_curr_task), since update_curr() (in the
				3558	* enqueue of curr) will have resulted in resched being set. This
				3559	* prevents us from potentially nominating it as a false LAST_BUDDY
				3560	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3561	*/
				3562	if (test_tsk_need_resched(curr))
				3563	return;
				3564
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3565	/* Idle tasks are by definition preempted by non-idle tasks. */
				3566	if (unlikely(curr->policy == SCHED_IDLE) &&
				3567	likely(p->policy != SCHED_IDLE))
				3568	goto preempt;
				3569
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3570	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3571	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				3572	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3573	*/
Ingo Molnar	8ed92e51	2012-10-14 14:28:50 +0200	[diff] [blame]	3574	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3575	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3576
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3577	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	3578	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3579	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3580	if (wakeup_preempt_entity(se, pse) == 1) {
				3581	/*
				3582	* Bias pick_next to pick the sched entity that is
				3583	* triggering this preemption.
				3584	*/
				3585	if (!next_buddy_marked)
				3586	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3587	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3588	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	3589
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3590	return;
				3591
				3592	preempt:
				3593	resched_task(curr);
				3594	/*
				3595	* Only set the backward buddy when the current task is still
				3596	* on the rq. This can happen when a wakeup gets interleaved
				3597	* with schedule on the ->pre_schedule() or idle_balance()
				3598	* point, either of which can * drop the rq lock.
				3599	*
				3600	* Also, during early boot the idle thread is in the fair class,
				3601	* for obvious reasons its a bad idea to schedule back to it.
				3602	*/
				3603	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				3604	return;
				3605
				3606	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				3607	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3608	}
				3609
Ingo Molnar	fb8d472	2007-08-09 11:16:48 +0200	[diff] [blame]	3610	static struct task_struct pick_next_task_fair(struct rq rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3611	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3612	struct task_struct *p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3613	struct cfs_rq *cfs_rq = &rq->cfs;
				3614	struct sched_entity *se;
				3615
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	3616	if (!cfs_rq->nr_running)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3617	return NULL;
				3618
				3619	do {
Ingo Molnar	9948f4b	2007-08-09 11:16:48 +0200	[diff] [blame]	3620	se = pick_next_entity(cfs_rq);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	3621	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3622	cfs_rq = group_cfs_rq(se);
				3623	} while (cfs_rq);
				3624
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3625	p = task_of(se);
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	3626	if (hrtick_enabled(rq))
				3627	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3628
				3629	return p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3630	}
				3631
				3632	/*
				3633	* Account for a descheduled task:
				3634	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	3635	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3636	{
				3637	struct sched_entity *se = &prev->se;
				3638	struct cfs_rq *cfs_rq;
				3639
				3640	for_each_sched_entity(se) {
				3641	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	3642	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3643	}
				3644	}
				3645
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3646	/*
				3647	* sched_yield() is very simple
				3648	*
				3649	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				3650	*/
				3651	static void yield_task_fair(struct rq *rq)
				3652	{
				3653	struct task_struct *curr = rq->curr;
				3654	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				3655	struct sched_entity *se = &curr->se;
				3656
				3657	/*
				3658	* Are we the only task in the tree?
				3659	*/
				3660	if (unlikely(rq->nr_running == 1))
				3661	return;
				3662
				3663	clear_buddies(cfs_rq, se);
				3664
				3665	if (curr->policy != SCHED_BATCH) {
				3666	update_rq_clock(rq);
				3667	/*
				3668	* Update run-time statistics of the 'current'.
				3669	*/
				3670	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	3671	/*
				3672	* Tell update_rq_clock() that we've just updated,
				3673	* so we don't do microscopic update in schedule()
				3674	* and double the fastpath cost.
				3675	*/
				3676	rq->skip_clock_update = 1;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3677	}
				3678
				3679	set_skip_buddy(se);
				3680	}
				3681
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3682	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				3683	{
				3684	struct sched_entity *se = &p->se;
				3685
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3686	/* throttled hierarchies are not runnable */
				3687	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3688	return false;
				3689
				3690	/* Tell the scheduler that we'd really like pse to run next. */
				3691	set_next_buddy(se);
				3692
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3693	yield_task_fair(rq);
				3694
				3695	return true;
				3696	}
				3697
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	3698	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3699	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	3700	* Fair scheduling class load-balancing methods.
				3701	*
				3702	* BASICS
				3703	*
				3704	* The purpose of load-balancing is to achieve the same basic fairness the
				3705	* per-cpu scheduler provides, namely provide a proportional amount of compute
				3706	* time to each task. This is expressed in the following equation:
				3707	*
				3708	* W_i,n/P_i == W_j,n/P_j for all i,j (1)
				3709	*
				3710	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				3711	* W_i,0 is defined as:
				3712	*
				3713	* W_i,0 = \Sum_j w_i,j (2)
				3714	*
				3715	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
				3716	* is derived from the nice value as per prio_to_weight[].
				3717	*
				3718	* The weight average is an exponential decay average of the instantaneous
				3719	* weight:
				3720	*
				3721	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				3722	*
				3723	* P_i is the cpu power (or compute capacity) of cpu i, typically it is the
				3724	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				3725	* can also include other factors [XXX].
				3726	*
				3727	* To achieve this balance we define a measure of imbalance which follows
				3728	* directly from (1):
				3729	*
				3730	* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
				3731	*
				3732	* We them move tasks around to minimize the imbalance. In the continuous
				3733	* function space it is obvious this converges, in the discrete case we get
				3734	* a few fun cases generally called infeasible weight scenarios.
				3735	*
				3736	* [XXX expand on:
				3737	* - infeasible weights;
				3738	* - local vs global optima in the discrete case. ]
				3739	*
				3740	*
				3741	* SCHED DOMAINS
				3742	*
				3743	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				3744	* for all i,j solution, we create a tree of cpus that follows the hardware
				3745	* topology where each level pairs two lower groups (or better). This results
				3746	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				3747	* tree to only the first of the previous level and we decrease the frequency
				3748	* of load-balance at each level inv. proportional to the number of cpus in
				3749	* the groups.
				3750	*
				3751	* This yields:
				3752	*
				3753	* log_2 n 1 n
				3754	* \Sum { --- * --- * 2^i } = O(n) (5)
				3755	* i = 0 2^i 2^i
				3756	* `- size of each group
				3757	* \| \| `- number of cpus doing load-balance
				3758	* \| `- freq
				3759	* `- sum over all levels
				3760	*
				3761	* Coupled with a limit on how many tasks we can migrate every balance pass,
				3762	* this makes (5) the runtime complexity of the balancer.
				3763	*
				3764	* An important property here is that each CPU is still (indirectly) connected
				3765	* to every other cpu in at most O(log n) steps:
				3766	*
				3767	* The adjacency matrix of the resulting graph is given by:
				3768	*
				3769	* log_2 n
				3770	* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
				3771	* k = 0
				3772	*
				3773	* And you'll find that:
				3774	*
				3775	* A^(log_2 n)_i,j != 0 for all i,j (7)
				3776	*
				3777	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				3778	* The task movement gives a factor of O(m), giving a convergence complexity
				3779	* of:
				3780	*
				3781	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				3782	*
				3783	*
				3784	* WORK CONSERVING
				3785	*
				3786	* In order to avoid CPUs going idle while there's still work to do, new idle
				3787	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				3788	* tree itself instead of relying on other CPUs to bring it work.
				3789	*
				3790	* This adds some complexity to both (5) and (8) but it reduces the total idle
				3791	* time.
				3792	*
				3793	* [XXX more?]
				3794	*
				3795	*
				3796	* CGROUPS
				3797	*
				3798	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				3799	*
				3800	* s_k,i
				3801	* W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
				3802	* S_k
				3803	*
				3804	* Where
				3805	*
				3806	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				3807	*
				3808	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				3809	*
				3810	* The big problem is S_k, its a global sum needed to compute a local (W_i)
				3811	* property.
				3812	*
				3813	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				3814	* rewrite all of this once again.]
				3815	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3816
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	3817	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				3818
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3819	#define LBF_ALL_PINNED 0x01
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3820	#define LBF_NEED_BREAK 0x02
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3821	#define LBF_SOME_PINNED 0x04
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3822
				3823	struct lb_env {
				3824	struct sched_domain *sd;
				3825
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3826	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	3827	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3828
				3829	int dst_cpu;
				3830	struct rq *dst_rq;
				3831
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3832	struct cpumask *dst_grpmask;
				3833	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3834	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	3835	long imbalance;
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	3836	/* The set of CPUs under consideration for load-balancing */
				3837	struct cpumask *cpus;
				3838
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3839	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3840
				3841	unsigned int loop;
				3842	unsigned int loop_break;
				3843	unsigned int loop_max;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3844	};
				3845
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3846	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3847	* move_task - move a task from one runqueue to another runqueue.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3848	* Both runqueues must be locked.
				3849	*/
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3850	static void move_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3851	{
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3852	deactivate_task(env->src_rq, p, 0);
				3853	set_task_cpu(p, env->dst_cpu);
				3854	activate_task(env->dst_rq, p, 0);
				3855	check_preempt_curr(env->dst_rq, p, 0);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3856	}
				3857
				3858	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3859	* Is this task likely cache-hot:
				3860	*/
				3861	static int
				3862	task_hot(struct task_struct p, u64 now, struct sched_domain sd)
				3863	{
				3864	s64 delta;
				3865
				3866	if (p->sched_class != &fair_sched_class)
				3867	return 0;
				3868
				3869	if (unlikely(p->policy == SCHED_IDLE))
				3870	return 0;
				3871
				3872	/*
				3873	* Buddy candidates are cache hot:
				3874	*/
				3875	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
				3876	(&p->se == cfs_rq_of(&p->se)->next \|\|
				3877	&p->se == cfs_rq_of(&p->se)->last))
				3878	return 1;
				3879
				3880	if (sysctl_sched_migration_cost == -1)
				3881	return 1;
				3882	if (sysctl_sched_migration_cost == 0)
				3883	return 0;
				3884
				3885	delta = now - p->se.exec_start;
				3886
				3887	return delta < (s64)sysctl_sched_migration_cost;
				3888	}
				3889
				3890	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3891	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				3892	*/
				3893	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3894	int can_migrate_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3895	{
				3896	int tsk_cache_hot = 0;
				3897	/*
				3898	* We do not migrate tasks that are:
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	3899	* 1) throttled_lb_pair, or
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3900	* 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	3901	* 3) running (obviously), or
				3902	* 4) are cache-hot on their current CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3903	*/
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	3904	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				3905	return 0;
				3906
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3907	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	3908	int cpu;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3909
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3910	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3911
				3912	/*
				3913	* Remember if this task can be migrated to any other cpu in
				3914	* our sched_group. We may want to revisit it if we couldn't
				3915	* meet load balance goals by pulling other tasks on src_cpu.
				3916	*
				3917	* Also avoid computing new_dst_cpu if we have already computed
				3918	* one in current iteration.
				3919	*/
				3920	if (!env->dst_grpmask \|\| (env->flags & LBF_SOME_PINNED))
				3921	return 0;
				3922
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	3923	/* Prevent to re-select dst_cpu via env's cpus */
				3924	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
				3925	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
				3926	env->flags \|= LBF_SOME_PINNED;
				3927	env->new_dst_cpu = cpu;
				3928	break;
				3929	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3930	}
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	3931
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3932	return 0;
				3933	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3934
				3935	/* Record that we found atleast one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3936	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3937
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3938	if (task_running(env->src_rq, p)) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3939	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3940	return 0;
				3941	}
				3942
				3943	/*
				3944	* Aggressive migration if:
				3945	* 1) task is cache cold, or
				3946	* 2) too many balance attempts have failed.
				3947	*/
				3948
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3949	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3950	if (!tsk_cache_hot \|\|
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3951	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	3952
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3953	if (tsk_cache_hot) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3954	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3955	schedstat_inc(p, se.statistics.nr_forced_migrations);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3956	}
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	3957
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3958	return 1;
				3959	}
				3960
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	3961	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
				3962	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3963	}
				3964
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3965	/*
				3966	* move_one_task tries to move exactly one task from busiest to this_rq, as
				3967	* part of active balancing operations within "domain".
				3968	* Returns 1 if successful and 0 otherwise.
				3969	*
				3970	* Called with both runqueues locked.
				3971	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3972	static int move_one_task(struct lb_env *env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3973	{
				3974	struct task_struct p, n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3975
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3976	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3977	if (!can_migrate_task(p, env))
				3978	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3979
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3980	move_task(p, env);
				3981	/*
				3982	* Right now, this is only the second place move_task()
				3983	* is called, so we can safely collect move_task()
				3984	* stats here rather than inside move_task().
				3985	*/
				3986	schedstat_inc(env->sd, lb_gained[env->idle]);
				3987	return 1;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3988	}
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3989	return 0;
				3990	}
				3991
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3992	static unsigned long task_h_load(struct task_struct *p);
				3993
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	3994	static const unsigned int sched_nr_migrate_break = 32;
				3995
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3996	/*
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	3997	* move_tasks tries to move up to imbalance weighted load from busiest to
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3998	* this_rq, as part of a balancing operation within domain "sd".
				3999	* Returns 1 if successful and 0 otherwise.
				4000	*
				4001	* Called with both runqueues locked.
				4002	*/
				4003	static int move_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4004	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4005	struct list_head *tasks = &env->src_rq->cfs_tasks;
				4006	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4007	unsigned long load;
				4008	int pulled = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4009
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4010	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4011	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4012
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4013	while (!list_empty(tasks)) {
				4014	p = list_first_entry(tasks, struct task_struct, se.group_node);
				4015
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4016	env->loop++;
				4017	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4018	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4019	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4020
				4021	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4022	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4023	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4024	env->flags \|= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4025	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	4026	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4027
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4028	if (!can_migrate_task(p, env))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4029	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4030
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4031	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4032
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4033	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4034	goto next;
				4035
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4036	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4037	goto next;
				4038
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4039	move_task(p, env);
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4040	pulled++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4041	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4042
				4043	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4044	/*
				4045	* NEWIDLE balancing is a source of latency, so preemptible
				4046	* kernels will stop after the first task is pulled to minimize
				4047	* the critical section.
				4048	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4049	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4050	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4051	#endif
				4052
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4053	/*
				4054	* We only want to steal up to the prescribed amount of
				4055	* weighted load.
				4056	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4057	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4058	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4059
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4060	continue;
				4061	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4062	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4063	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4064
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4065	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4066	* Right now, this is one of only two places move_task() is called,
				4067	* so we can safely collect move_task() stats here rather than
				4068	* inside move_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4069	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4070	schedstat_add(env->sd, lb_gained[env->idle], pulled);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4071
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4072	return pulled;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4073	}
				4074
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4075	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4076	/*
				4077	* update tg->load_weight by folding this cpu's load_avg
				4078	*/
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4079	static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4080	{
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4081	struct sched_entity *se = tg->se[cpu];
				4082	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4083
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4084	/* throttled entities do not contribute to load */
				4085	if (throttled_hierarchy(cfs_rq))
				4086	return;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4087
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	4088	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4089
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4090	if (se) {
				4091	update_entity_load_avg(se, 1);
				4092	/*
				4093	* We pivot on our runnable average having decayed to zero for
				4094	* list removal. This generally implies that all our children
				4095	* have also been removed (modulo rounding error or bandwidth
				4096	* control); however, such cases are rare and we can fix these
				4097	* at enqueue.
				4098	*
				4099	* TODO: fix up out-of-order children on enqueue.
				4100	*/
				4101	if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
				4102	list_del_leaf_cfs_rq(cfs_rq);
				4103	} else {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4104	struct rq *rq = rq_of(cfs_rq);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4105	update_rq_runnable_avg(rq, rq->nr_running);
				4106	}
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4107	}
				4108
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4109	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4110	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4111	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4112	struct cfs_rq *cfs_rq;
				4113	unsigned long flags;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4114
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4115	raw_spin_lock_irqsave(&rq->lock, flags);
				4116	update_rq_clock(rq);
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4117	/*
				4118	* Iterates the task_group tree in a bottom up fashion, see
				4119	* list_add_leaf_cfs_rq() for details.
				4120	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4121	for_each_leaf_cfs_rq(rq, cfs_rq) {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4122	/*
				4123	* Note: We may want to consider periodically releasing
				4124	* rq->lock about these updates so that creating many task
				4125	* groups does not result in continually extending hold time.
				4126	*/
				4127	__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4128	}
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4129
				4130	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4131	}
				4132
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4133	/*
				4134	* Compute the cpu's hierarchical load factor for each task group.
				4135	* This needs to be done in a top-down fashion because the load of a child
				4136	* group is a fraction of its parents load.
				4137	*/
				4138	static int tg_load_down(struct task_group tg, void data)
				4139	{
				4140	unsigned long load;
				4141	long cpu = (long)data;
				4142
				4143	if (!tg->parent) {
				4144	load = cpu_rq(cpu)->load.weight;
				4145	} else {
				4146	load = tg->parent->cfs_rq[cpu]->h_load;
				4147	load *= tg->se[cpu]->load.weight;
				4148	load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
				4149	}
				4150
				4151	tg->cfs_rq[cpu]->h_load = load;
				4152
				4153	return 0;
				4154	}
				4155
				4156	static void update_h_load(long cpu)
				4157	{
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4158	struct rq *rq = cpu_rq(cpu);
				4159	unsigned long now = jiffies;
				4160
				4161	if (rq->h_load_throttle == now)
				4162	return;
				4163
				4164	rq->h_load_throttle = now;
				4165
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4166	rcu_read_lock();
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4167	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4168	rcu_read_unlock();
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4169	}
				4170
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4171	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4172	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4173	struct cfs_rq *cfs_rq = task_cfs_rq(p);
				4174	unsigned long load;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4175
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4176	load = p->se.load.weight;
				4177	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4178
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4179	return load;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4180	}
				4181	#else
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4182	static inline void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4183	{
				4184	}
				4185
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4186	static inline void update_h_load(long cpu)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4187	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4188	}
				4189
				4190	static unsigned long task_h_load(struct task_struct *p)
				4191	{
				4192	return p->se.load.weight;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4193	}
				4194	#endif
				4195
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4196	/******** Helpers for find_busiest_group **********************/
				4197	/*
				4198	* sd_lb_stats - Structure to store the statistics of a sched_domain
				4199	* during load balancing.
				4200	*/
				4201	struct sd_lb_stats {
				4202	struct sched_group busiest; / Busiest group in this sd */
				4203	struct sched_group this; / Local group in this sd */
				4204	unsigned long total_load; /* Total load of all groups in sd */
				4205	unsigned long total_pwr; /* Total power of all groups in sd */
				4206	unsigned long avg_load; /* Average load across all groups in sd */
				4207
				4208	/** Statistics of this group */
				4209	unsigned long this_load;
				4210	unsigned long this_load_per_task;
				4211	unsigned long this_nr_running;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4212	unsigned long this_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4213	unsigned int this_idle_cpus;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4214
				4215	/* Statistics of the busiest group */
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4216	unsigned int busiest_idle_cpus;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4217	unsigned long max_load;
				4218	unsigned long busiest_load_per_task;
				4219	unsigned long busiest_nr_running;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4220	unsigned long busiest_group_capacity;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4221	unsigned long busiest_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4222	unsigned int busiest_group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4223
				4224	int group_imb; /* Is there imbalance in this sd */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4225	};
				4226
				4227	/*
				4228	* sg_lb_stats - stats of a sched_group required for load_balancing
				4229	*/
				4230	struct sg_lb_stats {
				4231	unsigned long avg_load; /Avg load across the CPUs of the group /
				4232	unsigned long group_load; /* Total load over the CPUs of the group */
				4233	unsigned long sum_nr_running; /* Nr tasks running in the group */
				4234	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
				4235	unsigned long group_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4236	unsigned long idle_cpus;
				4237	unsigned long group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4238	int group_imb; /* Is there an imbalance in the group ? */
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4239	int group_has_capacity; /* Is there extra capacity in the group? */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4240	};
				4241
				4242	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4243	* get_sd_load_idx - Obtain the load index for a given sched domain.
				4244	* @sd: The sched_domain whose load_idx is to be obtained.
				4245	* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
				4246	*/
				4247	static inline int get_sd_load_idx(struct sched_domain *sd,
				4248	enum cpu_idle_type idle)
				4249	{
				4250	int load_idx;
				4251
				4252	switch (idle) {
				4253	case CPU_NOT_IDLE:
				4254	load_idx = sd->busy_idx;
				4255	break;
				4256
				4257	case CPU_NEWLY_IDLE:
				4258	load_idx = sd->newidle_idx;
				4259	break;
				4260	default:
				4261	load_idx = sd->idle_idx;
				4262	break;
				4263	}
				4264
				4265	return load_idx;
				4266	}
				4267
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4268	static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4269	{
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4270	return SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4271	}
				4272
				4273	unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
				4274	{
				4275	return default_scale_freq_power(sd, cpu);
				4276	}
				4277
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4278	static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4279	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4280	unsigned long weight = sd->span_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4281	unsigned long smt_gain = sd->smt_gain;
				4282
				4283	smt_gain /= weight;
				4284
				4285	return smt_gain;
				4286	}
				4287
				4288	unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
				4289	{
				4290	return default_scale_smt_power(sd, cpu);
				4291	}
				4292
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4293	static unsigned long scale_rt_power(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4294	{
				4295	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4296	u64 total, available, age_stamp, avg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4297
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4298	/*
				4299	* Since we're reading these variables without serialization make sure
				4300	* we read them once before doing sanity checks on them.
				4301	*/
				4302	age_stamp = ACCESS_ONCE(rq->age_stamp);
				4303	avg = ACCESS_ONCE(rq->rt_avg);
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4304
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4305	total = sched_avg_period() + (rq->clock - age_stamp);
				4306
				4307	if (unlikely(total < avg)) {
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4308	/* Ensures that power won't end up being negative */
				4309	available = 0;
				4310	} else {
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4311	available = total - avg;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4312	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4313
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4314	if (unlikely((s64)total < SCHED_POWER_SCALE))
				4315	total = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4316
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4317	total >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4318
				4319	return div_u64(available, total);
				4320	}
				4321
				4322	static void update_cpu_power(struct sched_domain *sd, int cpu)
				4323	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4324	unsigned long weight = sd->span_weight;
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4325	unsigned long power = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4326	struct sched_group *sdg = sd->groups;
				4327
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4328	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
				4329	if (sched_feat(ARCH_POWER))
				4330	power *= arch_scale_smt_power(sd, cpu);
				4331	else
				4332	power *= default_scale_smt_power(sd, cpu);
				4333
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4334	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4335	}
				4336
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4337	sdg->sgp->power_orig = power;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4338
				4339	if (sched_feat(ARCH_POWER))
				4340	power *= arch_scale_freq_power(sd, cpu);
				4341	else
				4342	power *= default_scale_freq_power(sd, cpu);
				4343
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4344	power >>= SCHED_POWER_SHIFT;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4345
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4346	power *= scale_rt_power(cpu);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4347	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4348
				4349	if (!power)
				4350	power = 1;
				4351
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4352	cpu_rq(cpu)->cpu_power = power;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4353	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4354	}
				4355
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4356	void update_group_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4357	{
				4358	struct sched_domain *child = sd->child;
				4359	struct sched_group group, sdg = sd->groups;
				4360	unsigned long power;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4361	unsigned long interval;
				4362
				4363	interval = msecs_to_jiffies(sd->balance_interval);
				4364	interval = clamp(interval, 1UL, max_load_balance_interval);
				4365	sdg->sgp->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4366
				4367	if (!child) {
				4368	update_cpu_power(sd, cpu);
				4369	return;
				4370	}
				4371
				4372	power = 0;
				4373
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4374	if (child->flags & SD_OVERLAP) {
				4375	/*
				4376	* SD_OVERLAP domains cannot assume that child groups
				4377	* span the current group.
				4378	*/
				4379
				4380	for_each_cpu(cpu, sched_group_cpus(sdg))
				4381	power += power_of(cpu);
				4382	} else {
				4383	/*
				4384	* !SD_OVERLAP domains can assume that child groups
				4385	* span the current group.
				4386	*/
				4387
				4388	group = child->groups;
				4389	do {
				4390	power += group->sgp->power;
				4391	group = group->next;
				4392	} while (group != child->groups);
				4393	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4394
Peter Zijlstra	c3decf0	2012-05-31 12:05:32 +0200	[diff] [blame]	4395	sdg->sgp->power_orig = sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4396	}
				4397
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4398	/*
				4399	* Try and fix up capacity for tiny siblings, this is needed when
				4400	* things like SD_ASYM_PACKING need f_b_g to select another sibling
				4401	* which on its own isn't powerful enough.
				4402	*
				4403	* See update_sd_pick_busiest() and check_asym_packing().
				4404	*/
				4405	static inline int
				4406	fix_small_capacity(struct sched_domain sd, struct sched_group group)
				4407	{
				4408	/*
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4409	* Only siblings can have significantly less than SCHED_POWER_SCALE
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4410	*/
Peter Zijlstra	a6c75f2	2011-04-07 14:09:52 +0200	[diff] [blame]	4411	if (!(sd->flags & SD_SHARE_CPUPOWER))
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4412	return 0;
				4413
				4414	/*
				4415	* If ~90% of the cpu_power is still there, we're good.
				4416	*/
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4417	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4418	return 1;
				4419
				4420	return 0;
				4421	}
				4422
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4423	/**
				4424	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4425	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4426	* @group: sched_group whose statistics are to be updated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4427	* @load_idx: Load index of sched_domain of this_cpu for load calc.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4428	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4429	* @balance: Should we balance.
				4430	* @sgs: variable to hold the statistics for this group.
				4431	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4432	static inline void update_sg_lb_stats(struct lb_env *env,
				4433	struct sched_group *group, int load_idx,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4434	int local_group, int balance, struct sg_lb_stats sgs)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4435	{
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4436	unsigned long nr_running, max_nr_running, min_nr_running;
				4437	unsigned long load, max_cpu_load, min_cpu_load;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4438	unsigned int balance_cpu = -1, first_idle_cpu = 0;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4439	unsigned long avg_load_per_task = 0;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4440	int i;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4441
Gautham R Shenoy	871e35b	2010-01-20 14:02:44 -0600	[diff] [blame]	4442	if (local_group)
Peter Zijlstra	c117487	2012-05-31 14:47:33 +0200	[diff] [blame]	4443	balance_cpu = group_balance_cpu(group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4444
				4445	/* Tally up the load of all CPUs in the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4446	max_cpu_load = 0;
				4447	min_cpu_load = ~0UL;
Nikhil Rao	2582f0e	2010-10-13 12:09:36 -0700	[diff] [blame]	4448	max_nr_running = 0;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4449	min_nr_running = ~0UL;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4450
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4451	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4452	struct rq *rq = cpu_rq(i);
				4453
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4454	nr_running = rq->nr_running;
				4455
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4456	/* Bias balancing toward cpus of our domain */
				4457	if (local_group) {
Peter Zijlstra	c117487	2012-05-31 14:47:33 +0200	[diff] [blame]	4458	if (idle_cpu(i) && !first_idle_cpu &&
				4459	cpumask_test_cpu(i, sched_group_mask(group))) {
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4460	first_idle_cpu = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4461	balance_cpu = i;
				4462	}
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4463
				4464	load = target_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4465	} else {
				4466	load = source_load(i, load_idx);
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4467	if (load > max_cpu_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4468	max_cpu_load = load;
				4469	if (min_cpu_load > load)
				4470	min_cpu_load = load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4471
				4472	if (nr_running > max_nr_running)
				4473	max_nr_running = nr_running;
				4474	if (min_nr_running > nr_running)
				4475	min_nr_running = nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4476	}
				4477
				4478	sgs->group_load += load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4479	sgs->sum_nr_running += nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4480	sgs->sum_weighted_load += weighted_cpuload(i);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4481	if (idle_cpu(i))
				4482	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4483	}
				4484
				4485	/*
				4486	* First idle cpu or the first cpu(busiest) in this sched group
				4487	* is eligible for doing load balancing at this and above
				4488	* domains. In the newly idle case, we will allow all the cpu's
				4489	* to do the newly idle load balance.
				4490	*/
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4491	if (local_group) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4492	if (env->idle != CPU_NEWLY_IDLE) {
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4493	if (balance_cpu != env->dst_cpu) {
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4494	*balance = 0;
				4495	return;
				4496	}
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4497	update_group_power(env->sd, env->dst_cpu);
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4498	} else if (time_after_eq(jiffies, group->sgp->next_update))
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4499	update_group_power(env->sd, env->dst_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4500	}
				4501
				4502	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4503	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4504
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4505	/*
				4506	* Consider the group unbalanced when the imbalance is larger
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	4507	* than the average weight of a task.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4508	*
				4509	* APZ: with cgroup the avg task weight can vary wildly and
				4510	* might not be a suitable number - should we keep a
				4511	* normalized nr_running number somewhere that negates
				4512	* the hierarchy?
				4513	*/
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4514	if (sgs->sum_nr_running)
				4515	avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4516
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4517	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
				4518	(max_nr_running - min_nr_running) > 1)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4519	sgs->group_imb = 1;
				4520
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4521	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4522	SCHED_POWER_SCALE);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4523	if (!sgs->group_capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4524	sgs->group_capacity = fix_small_capacity(env->sd, group);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4525	sgs->group_weight = group->group_weight;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4526
				4527	if (sgs->group_capacity > sgs->sum_nr_running)
				4528	sgs->group_has_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4529	}
				4530
				4531	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4532	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4533	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4534	* @sds: sched_domain statistics
				4535	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4536	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4537	*
				4538	* Determine if @sg is a busier group than the previously selected
				4539	* busiest group.
				4540	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4541	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4542	struct sd_lb_stats *sds,
				4543	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4544	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4545	{
				4546	if (sgs->avg_load <= sds->max_load)
				4547	return false;
				4548
				4549	if (sgs->sum_nr_running > sgs->group_capacity)
				4550	return true;
				4551
				4552	if (sgs->group_imb)
				4553	return true;
				4554
				4555	/*
				4556	* ASYM_PACKING needs to move all the work to the lowest
				4557	* numbered CPUs in the group, therefore mark all groups
				4558	* higher than ourself as busy.
				4559	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4560	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
				4561	env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4562	if (!sds->busiest)
				4563	return true;
				4564
				4565	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
				4566	return true;
				4567	}
				4568
				4569	return false;
				4570	}
				4571
				4572	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	4573	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4574	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4575	* @balance: Should we balance.
				4576	* @sds: variable to hold the statistics for this sched_domain.
				4577	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4578	static inline void update_sd_lb_stats(struct lb_env *env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4579	int balance, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4580	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4581	struct sched_domain *child = env->sd->child;
				4582	struct sched_group *sg = env->sd->groups;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4583	struct sg_lb_stats sgs;
				4584	int load_idx, prefer_sibling = 0;
				4585
				4586	if (child && child->flags & SD_PREFER_SIBLING)
				4587	prefer_sibling = 1;
				4588
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4589	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4590
				4591	do {
				4592	int local_group;
				4593
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4594	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4595	memset(&sgs, 0, sizeof(sgs));
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4596	update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4597
Peter Zijlstra	8f190fb	2009-12-24 14:18:21 +0100	[diff] [blame]	4598	if (local_group && !(*balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4599	return;
				4600
				4601	sds->total_load += sgs.group_load;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4602	sds->total_pwr += sg->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4603
				4604	/*
				4605	* In case the child domain prefers tasks go to siblings
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4606	* first, lower the sg capacity to one so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	4607	* and move all the excess tasks away. We lower the capacity
				4608	* of a group only if the local group has the capacity to fit
				4609	* these excess tasks, i.e. nr_running < group_capacity. The
				4610	* extra check prevents the case where you always pull from the
				4611	* heaviest group when it is already under-utilized (possible
				4612	* with a large weight task outweighs the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4613	*/
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	4614	if (prefer_sibling && !local_group && sds->this_has_capacity)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4615	sgs.group_capacity = min(sgs.group_capacity, 1UL);
				4616
				4617	if (local_group) {
				4618	sds->this_load = sgs.avg_load;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4619	sds->this = sg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4620	sds->this_nr_running = sgs.sum_nr_running;
				4621	sds->this_load_per_task = sgs.sum_weighted_load;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4622	sds->this_has_capacity = sgs.group_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4623	sds->this_idle_cpus = sgs.idle_cpus;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4624	} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4625	sds->max_load = sgs.avg_load;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4626	sds->busiest = sg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4627	sds->busiest_nr_running = sgs.sum_nr_running;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4628	sds->busiest_idle_cpus = sgs.idle_cpus;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4629	sds->busiest_group_capacity = sgs.group_capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4630	sds->busiest_load_per_task = sgs.sum_weighted_load;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4631	sds->busiest_has_capacity = sgs.group_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4632	sds->busiest_group_weight = sgs.group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4633	sds->group_imb = sgs.group_imb;
				4634	}
				4635
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4636	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4637	} while (sg != env->sd->groups);
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4638	}
				4639
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4640	/**
				4641	* check_asym_packing - Check to see if the group is packed into the
				4642	* sched doman.
				4643	*
				4644	* This is primarily intended to used at the sibling level. Some
				4645	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				4646	* case of POWER7, it can move to lower SMT modes only when higher
				4647	* threads are idle. When in lower SMT modes, the threads will
				4648	* perform better since they share less core resources. Hence when we
				4649	* have idle threads, we want them to be the higher ones.
				4650	*
				4651	* This packing function is run on idle threads. It checks to see if
				4652	* the busiest CPU in this domain (core in the P7 case) has a higher
				4653	* CPU number than the packing function is being run on. Here we are
				4654	* assuming lower CPU number will be equivalent to lower a SMT thread
				4655	* number.
				4656	*
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4657	* Returns 1 when packing is required and a task should be moved to
				4658	* this CPU. The amount of the imbalance is returned in *imbalance.
				4659	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4660	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4661	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4662	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4663	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4664	{
				4665	int busiest_cpu;
				4666
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4667	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4668	return 0;
				4669
				4670	if (!sds->busiest)
				4671	return 0;
				4672
				4673	busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4674	if (env->dst_cpu > busiest_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4675	return 0;
				4676
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4677	env->imbalance = DIV_ROUND_CLOSEST(
				4678	sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
				4679
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4680	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4681	}
				4682
				4683	/**
				4684	* fix_small_imbalance - Calculate the minor imbalance that exists
				4685	* amongst the groups of a sched_domain, during
				4686	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4687	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4688	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4689	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4690	static inline
				4691	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4692	{
				4693	unsigned long tmp, pwr_now = 0, pwr_move = 0;
				4694	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4695	unsigned long scaled_busy_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4696
				4697	if (sds->this_nr_running) {
				4698	sds->this_load_per_task /= sds->this_nr_running;
				4699	if (sds->busiest_load_per_task >
				4700	sds->this_load_per_task)
				4701	imbn = 1;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4702	} else {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4703	sds->this_load_per_task =
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4704	cpu_avg_load_per_task(env->dst_cpu);
				4705	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4706
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4707	scaled_busy_load_per_task = sds->busiest_load_per_task
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4708	* SCHED_POWER_SCALE;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4709	scaled_busy_load_per_task /= sds->busiest->sgp->power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4710
				4711	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
				4712	(scaled_busy_load_per_task * imbn)) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4713	env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4714	return;
				4715	}
				4716
				4717	/*
				4718	* OK, we don't have enough imbalance to justify moving tasks,
				4719	* however we may be able to increase total CPU power used by
				4720	* moving them.
				4721	*/
				4722
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4723	pwr_now += sds->busiest->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4724	min(sds->busiest_load_per_task, sds->max_load);
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4725	pwr_now += sds->this->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4726	min(sds->this_load_per_task, sds->this_load);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4727	pwr_now /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4728
				4729	/* Amount of load we'd subtract */
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4730	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4731	sds->busiest->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4732	if (sds->max_load > tmp)
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4733	pwr_move += sds->busiest->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4734	min(sds->busiest_load_per_task, sds->max_load - tmp);
				4735
				4736	/* Amount of load we'd add */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4737	if (sds->max_load * sds->busiest->sgp->power <
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4738	sds->busiest_load_per_task * SCHED_POWER_SCALE)
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4739	tmp = (sds->max_load * sds->busiest->sgp->power) /
				4740	sds->this->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4741	else
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4742	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4743	sds->this->sgp->power;
				4744	pwr_move += sds->this->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4745	min(sds->this_load_per_task, sds->this_load + tmp);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4746	pwr_move /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4747
				4748	/* Move if we gain throughput */
				4749	if (pwr_move > pwr_now)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4750	env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4751	}
				4752
				4753	/**
				4754	* calculate_imbalance - Calculate the amount of imbalance present within the
				4755	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4756	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4757	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4758	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4759	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4760	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4761	unsigned long max_pull, load_above_capacity = ~0UL;
				4762
				4763	sds->busiest_load_per_task /= sds->busiest_nr_running;
				4764	if (sds->group_imb) {
				4765	sds->busiest_load_per_task =
				4766	min(sds->busiest_load_per_task, sds->avg_load);
				4767	}
				4768
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4769	/*
				4770	* In the presence of smp nice balancing, certain scenarios can have
				4771	* max load less than avg load(as we skip the groups at or below
				4772	* its cpu_power, while calculating max_load..)
				4773	*/
				4774	if (sds->max_load < sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4775	env->imbalance = 0;
				4776	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4777	}
				4778
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4779	if (!sds->group_imb) {
				4780	/*
				4781	* Don't want to pull so many tasks that a group would go idle.
				4782	*/
				4783	load_above_capacity = (sds->busiest_nr_running -
				4784	sds->busiest_group_capacity);
				4785
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4786	load_above_capacity = (SCHED_LOAD_SCALE SCHED_POWER_SCALE);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4787
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4788	load_above_capacity /= sds->busiest->sgp->power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4789	}
				4790
				4791	/*
				4792	* We're trying to get all the cpus to the average_load, so we don't
				4793	* want to push ourselves above the average load, nor do we wish to
				4794	* reduce the max loaded cpu below the average load. At the same time,
				4795	* we also don't want to reduce the group load below the group capacity
				4796	* (so that we can implement power-savings policies etc). Thus we look
				4797	* for the minimum possible imbalance.
				4798	* Be careful of negative numbers as they'll appear as very large values
				4799	* with unsigned longs.
				4800	*/
				4801	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4802
				4803	/* How much load to actually move to equalise the imbalance */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4804	env->imbalance = min(max_pull * sds->busiest->sgp->power,
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4805	(sds->avg_load - sds->this_load) * sds->this->sgp->power)
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4806	/ SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4807
				4808	/*
				4809	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	4810	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4811	* a think about bumping its value to force at least one task to be
				4812	* moved
				4813	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4814	if (env->imbalance < sds->busiest_load_per_task)
				4815	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4816
				4817	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4818
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4819	/***** find_busiest_group() helpers end here *******************/
				4820
				4821	/**
				4822	* find_busiest_group - Returns the busiest group within the sched_domain
				4823	* if there is an imbalance. If there isn't an imbalance, and
				4824	* the user has opted for power-savings, it returns a group whose
				4825	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
				4826	* such a group exists.
				4827	*
				4828	* Also calculates the amount of weighted load which should be moved
				4829	* to restore balance.
				4830	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4831	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4832	* @balance: Pointer to a variable indicating if this_cpu
				4833	* is the appropriate cpu to perform load balancing at this_level.
				4834	*
				4835	* Returns: - the busiest group if imbalance exists.
				4836	* - If no imbalance and user has opted for power-savings balance,
				4837	* return the least loaded group whose CPUs can be
				4838	* put to idle by rebalancing its tasks onto our group.
				4839	*/
				4840	static struct sched_group *
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4841	find_busiest_group(struct lb_env env, int balance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4842	{
				4843	struct sd_lb_stats sds;
				4844
				4845	memset(&sds, 0, sizeof(sds));
				4846
				4847	/*
				4848	* Compute the various statistics relavent for load balancing at
				4849	* this level.
				4850	*/
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4851	update_sd_lb_stats(env, balance, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4852
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4853	/*
				4854	* this_cpu is not the appropriate cpu to perform load balancing at
				4855	* this level.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4856	*/
Peter Zijlstra	8f190fb	2009-12-24 14:18:21 +0100	[diff] [blame]	4857	if (!(*balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4858	goto ret;
				4859
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4860	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&
				4861	check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4862	return sds.busiest;
				4863
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4864	/* There is no busy sibling group to pull tasks from */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4865	if (!sds.busiest \|\| sds.busiest_nr_running == 0)
				4866	goto out_balanced;
				4867
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4868	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	4869
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	4870	/*
				4871	* If the busiest group is imbalanced the below checks don't
				4872	* work because they assumes all things are equal, which typically
				4873	* isn't true due to cpus_allowed constraints and the like.
				4874	*/
				4875	if (sds.group_imb)
				4876	goto force_balance;
				4877
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4878	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4879	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4880	!sds.busiest_has_capacity)
				4881	goto force_balance;
				4882
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4883	/*
				4884	* If the local group is more busy than the selected busiest group
				4885	* don't try and pull any tasks.
				4886	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4887	if (sds.this_load >= sds.max_load)
				4888	goto out_balanced;
				4889
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4890	/*
				4891	* Don't pull any tasks if this group is already above the domain
				4892	* average load.
				4893	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4894	if (sds.this_load >= sds.avg_load)
				4895	goto out_balanced;
				4896
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4897	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4898	/*
				4899	* This cpu is idle. If the busiest group load doesn't
				4900	* have more tasks than the number of available cpu's and
				4901	* there is no imbalance between this and busiest group
				4902	* wrt to idle cpu's, it is balanced.
				4903	*/
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4904	if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4905	sds.busiest_nr_running <= sds.busiest_group_weight)
				4906	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4907	} else {
				4908	/*
				4909	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				4910	* imbalance_pct to be conservative.
				4911	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4912	if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4913	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4914	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4915
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4916	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4917	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4918	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4919	return sds.busiest;
				4920
				4921	out_balanced:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4922	ret:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4923	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4924	return NULL;
				4925	}
				4926
				4927	/*
				4928	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				4929	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4930	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4931	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4932	{
				4933	struct rq busiest = NULL, rq;
				4934	unsigned long max_load = 0;
				4935	int i;
				4936
				4937	for_each_cpu(i, sched_group_cpus(group)) {
				4938	unsigned long power = power_of(i);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4939	unsigned long capacity = DIV_ROUND_CLOSEST(power,
				4940	SCHED_POWER_SCALE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4941	unsigned long wl;
				4942
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4943	if (!capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4944	capacity = fix_small_capacity(env->sd, group);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4945
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4946	if (!cpumask_test_cpu(i, env->cpus))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4947	continue;
				4948
				4949	rq = cpu_rq(i);
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4950	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4951
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4952	/*
				4953	* When comparing with imbalance, use weighted_cpuload()
				4954	* which is not scaled with the cpu power.
				4955	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4956	if (capacity && rq->nr_running == 1 && wl > env->imbalance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4957	continue;
				4958
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4959	/*
				4960	* For the load comparisons with the other cpu's, consider
				4961	* the weighted_cpuload() scaled with the cpu power, so that
				4962	* the load can be moved away from the cpu that is potentially
				4963	* running at a lower capacity.
				4964	*/
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4965	wl = (wl * SCHED_POWER_SCALE) / power;
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4966
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4967	if (wl > max_load) {
				4968	max_load = wl;
				4969	busiest = rq;
				4970	}
				4971	}
				4972
				4973	return busiest;
				4974	}
				4975
				4976	/*
				4977	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				4978	* so long as it is large enough.
				4979	*/
				4980	#define MAX_PINNED_INTERVAL 512
				4981
				4982	/* Working cpumask for load_balance and load_balance_newidle. */
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	4983	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4984
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4985	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	4986	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4987	struct sched_domain *sd = env->sd;
				4988
				4989	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4990
				4991	/*
				4992	* ASYM_PACKING needs to force migrate tasks from busy but
				4993	* higher numbered CPUs in order to pack all tasks in the
				4994	* lowest numbered CPUs.
				4995	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4996	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4997	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	4998	}
				4999
				5000	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				5001	}
				5002
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5003	static int active_load_balance_cpu_stop(void *data);
				5004
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5005	/*
				5006	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				5007	* tasks if there is an imbalance.
				5008	*/
				5009	static int load_balance(int this_cpu, struct rq *this_rq,
				5010	struct sched_domain *sd, enum cpu_idle_type idle,
				5011	int *balance)
				5012	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5013	int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5014	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5015	struct rq *busiest;
				5016	unsigned long flags;
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	5017	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5018
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5019	struct lb_env env = {
				5020	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5021	.dst_cpu = this_cpu,
				5022	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5023	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5024	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	5025	.loop_break = sched_nr_migrate_break,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5026	.cpus = cpus,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5027	};
				5028
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5029	/*
				5030	* For NEWLY_IDLE load_balancing, we don't need to consider
				5031	* other cpus in our group
				5032	*/
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5033	if (idle == CPU_NEWLY_IDLE)
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5034	env.dst_grpmask = NULL;
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5035
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5036	cpumask_copy(cpus, cpu_active_mask);
				5037
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5038	schedstat_inc(sd, lb_count[idle]);
				5039
				5040	redo:
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5041	group = find_busiest_group(&env, balance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5042
				5043	if (*balance == 0)
				5044	goto out_balanced;
				5045
				5046	if (!group) {
				5047	schedstat_inc(sd, lb_nobusyg[idle]);
				5048	goto out_balanced;
				5049	}
				5050
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5051	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5052	if (!busiest) {
				5053	schedstat_inc(sd, lb_nobusyq[idle]);
				5054	goto out_balanced;
				5055	}
				5056
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5057	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5058
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5059	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5060
				5061	ld_moved = 0;
				5062	if (busiest->nr_running > 1) {
				5063	/*
				5064	* Attempt to move tasks. If find_busiest_group has found
				5065	* an imbalance but busiest->nr_running <= 1, the group is
				5066	* still unbalanced. ld_moved simply stays zero, so it is
				5067	* correctly treated as an imbalance.
				5068	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5069	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	5070	env.src_cpu = busiest->cpu;
				5071	env.src_rq = busiest;
				5072	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5073
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	5074	update_h_load(env.src_cpu);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5075	more_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5076	local_irq_save(flags);
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5077	double_rq_lock(env.dst_rq, busiest);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5078
				5079	/*
				5080	* cur_ld_moved - load moved in current iteration
				5081	* ld_moved - cumulative load moved across iterations
				5082	*/
				5083	cur_ld_moved = move_tasks(&env);
				5084	ld_moved += cur_ld_moved;
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5085	double_rq_unlock(env.dst_rq, busiest);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5086	local_irq_restore(flags);
				5087
				5088	/*
				5089	* some other cpu did the load balance for us.
				5090	*/
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5091	if (cur_ld_moved && env.dst_cpu != smp_processor_id())
				5092	resched_cpu(env.dst_cpu);
				5093
Joonsoo Kim	f1cd085	2013-04-23 17:27:37 +0900	[diff] [blame]	5094	if (env.flags & LBF_NEED_BREAK) {
				5095	env.flags &= ~LBF_NEED_BREAK;
				5096	goto more_balance;
				5097	}
				5098
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5099	/*
				5100	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				5101	* us and move them to an alternate dst_cpu in our sched_group
				5102	* where they can run. The upper limit on how many times we
				5103	* iterate on same src_cpu is dependent on number of cpus in our
				5104	* sched_group.
				5105	*
				5106	* This changes load balance semantics a bit on who can move
				5107	* load to a given_cpu. In addition to the given_cpu itself
				5108	* (or a ilb_cpu acting on its behalf where given_cpu is
				5109	* nohz-idle), we now have balance_cpu in a position to move
				5110	* load to given_cpu. In rare situations, this may cause
				5111	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				5112	* _independently_ and at _same_ time to move some load to
				5113	* given_cpu) causing exceess load to be moved to given_cpu.
				5114	* This however should not happen so much in practice and
				5115	* moreover subsequent load balance cycles should correct the
				5116	* excess load moved.
				5117	*/
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5118	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5119
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5120	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5121	env.dst_cpu = env.new_dst_cpu;
				5122	env.flags &= ~LBF_SOME_PINNED;
				5123	env.loop = 0;
				5124	env.loop_break = sched_nr_migrate_break;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5125
				5126	/* Prevent to re-select dst_cpu via env's cpus */
				5127	cpumask_clear_cpu(env.dst_cpu, env.cpus);
				5128
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5129	/*
				5130	* Go back to "more_balance" rather than "redo" since we
				5131	* need to continue with same src_cpu.
				5132	*/
				5133	goto more_balance;
				5134	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5135
				5136	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5137	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5138	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5139	if (!cpumask_empty(cpus)) {
				5140	env.loop = 0;
				5141	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5142	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5143	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5144	goto out_balanced;
				5145	}
				5146	}
				5147
				5148	if (!ld_moved) {
				5149	schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	5150	/*
				5151	* Increment the failure counter only on periodic balance.
				5152	* We do not want newidle balance, which can be very
				5153	* frequent, pollute the failure counter causing
				5154	* excessive cache_hot migrations and active balances.
				5155	*/
				5156	if (idle != CPU_NEWLY_IDLE)
				5157	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5158
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5159	if (need_active_balance(&env)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5160	raw_spin_lock_irqsave(&busiest->lock, flags);
				5161
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5162	/* don't kick the active_load_balance_cpu_stop,
				5163	* if the curr task on busiest cpu can't be
				5164	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5165	*/
				5166	if (!cpumask_test_cpu(this_cpu,
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	5167	tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5168	raw_spin_unlock_irqrestore(&busiest->lock,
				5169	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5170	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5171	goto out_one_pinned;
				5172	}
				5173
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5174	/*
				5175	* ->active_balance synchronizes accesses to
				5176	* ->active_balance_work. Once set, it's cleared
				5177	* only after active load balance is finished.
				5178	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5179	if (!busiest->active_balance) {
				5180	busiest->active_balance = 1;
				5181	busiest->push_cpu = this_cpu;
				5182	active_balance = 1;
				5183	}
				5184	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5185
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5186	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5187	stop_one_cpu_nowait(cpu_of(busiest),
				5188	active_load_balance_cpu_stop, busiest,
				5189	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5190	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5191
				5192	/*
				5193	* We've kicked active balancing, reset the failure
				5194	* counter.
				5195	*/
				5196	sd->nr_balance_failed = sd->cache_nice_tries+1;
				5197	}
				5198	} else
				5199	sd->nr_balance_failed = 0;
				5200
				5201	if (likely(!active_balance)) {
				5202	/* We were unbalanced, so reset the balancing interval */
				5203	sd->balance_interval = sd->min_interval;
				5204	} else {
				5205	/*
				5206	* If we've begun active balancing, start to back off. This
				5207	* case may not be covered by the all_pinned logic if there
				5208	* is only 1 task on the busy runqueue (because we don't call
				5209	* move_tasks).
				5210	*/
				5211	if (sd->balance_interval < sd->max_interval)
				5212	sd->balance_interval *= 2;
				5213	}
				5214
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5215	goto out;
				5216
				5217	out_balanced:
				5218	schedstat_inc(sd, lb_balanced[idle]);
				5219
				5220	sd->nr_balance_failed = 0;
				5221
				5222	out_one_pinned:
				5223	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5224	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	5225	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5226	(sd->balance_interval < sd->max_interval))
				5227	sd->balance_interval *= 2;
				5228
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	5229	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5230	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5231	return ld_moved;
				5232	}
				5233
				5234	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5235	* idle_balance is called by schedule() if this_cpu is about to become
				5236	* idle. Attempts to pull tasks from other CPUs.
				5237	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5238	void idle_balance(int this_cpu, struct rq *this_rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5239	{
				5240	struct sched_domain *sd;
				5241	int pulled_task = 0;
				5242	unsigned long next_balance = jiffies + HZ;
				5243
				5244	this_rq->idle_stamp = this_rq->clock;
				5245
				5246	if (this_rq->avg_idle < sysctl_sched_migration_cost)
				5247	return;
				5248
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5249	/*
				5250	* Drop the rq->lock, but keep IRQ/preempt disabled.
				5251	*/
				5252	raw_spin_unlock(&this_rq->lock);
				5253
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5254	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5255	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5256	for_each_domain(this_cpu, sd) {
				5257	unsigned long interval;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5258	int balance = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5259
				5260	if (!(sd->flags & SD_LOAD_BALANCE))
				5261	continue;
				5262
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5263	if (sd->flags & SD_BALANCE_NEWIDLE) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5264	/* If we've pulled tasks over stop searching: */
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5265	pulled_task = load_balance(this_cpu, this_rq,
				5266	sd, CPU_NEWLY_IDLE, &balance);
				5267	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5268
				5269	interval = msecs_to_jiffies(sd->balance_interval);
				5270	if (time_after(next_balance, sd->last_balance + interval))
				5271	next_balance = sd->last_balance + interval;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5272	if (pulled_task) {
				5273	this_rq->idle_stamp = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5274	break;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5275	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5276	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5277	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5278
				5279	raw_spin_lock(&this_rq->lock);
				5280
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5281	if (pulled_task \|\| time_after(jiffies, this_rq->next_balance)) {
				5282	/*
				5283	* We are going idle. next_balance may be set based on
				5284	* a busy processor. So reset next_balance.
				5285	*/
				5286	this_rq->next_balance = next_balance;
				5287	}
				5288	}
				5289
				5290	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5291	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				5292	* running tasks off the busiest CPU onto idle CPUs. It requires at
				5293	* least 1 task to be running on each physical CPU where possible, and
				5294	* avoids physical / logical imbalances.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5295	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5296	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5297	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5298	struct rq *busiest_rq = data;
				5299	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5300	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5301	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5302	struct sched_domain *sd;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5303
				5304	raw_spin_lock_irq(&busiest_rq->lock);
				5305
				5306	/* make sure the requested cpu hasn't gone down in the meantime */
				5307	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				5308	!busiest_rq->active_balance))
				5309	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5310
				5311	/* Is there any task to move? */
				5312	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5313	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5314
				5315	/*
				5316	* This condition is "impossible", if it occurs
				5317	* we need to fix it. Originally reported by
				5318	* Bjorn Helgaas on a 128-cpu setup.
				5319	*/
				5320	BUG_ON(busiest_rq == target_rq);
				5321
				5322	/* move a task from busiest_rq to target_rq */
				5323	double_lock_balance(busiest_rq, target_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5324
				5325	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5326	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5327	for_each_domain(target_cpu, sd) {
				5328	if ((sd->flags & SD_LOAD_BALANCE) &&
				5329	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				5330	break;
				5331	}
				5332
				5333	if (likely(sd)) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5334	struct lb_env env = {
				5335	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5336	.dst_cpu = target_cpu,
				5337	.dst_rq = target_rq,
				5338	.src_cpu = busiest_rq->cpu,
				5339	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5340	.idle = CPU_IDLE,
				5341	};
				5342
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5343	schedstat_inc(sd, alb_count);
				5344
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5345	if (move_one_task(&env))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5346	schedstat_inc(sd, alb_pushed);
				5347	else
				5348	schedstat_inc(sd, alb_failed);
				5349	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5350	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5351	double_unlock_balance(busiest_rq, target_rq);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5352	out_unlock:
				5353	busiest_rq->active_balance = 0;
				5354	raw_spin_unlock_irq(&busiest_rq->lock);
				5355	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5356	}
				5357
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5358	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5359	/*
				5360	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5361	* - When one of the busy CPUs notice that there may be an idle rebalancing
				5362	* needed, they will kick the idle load balancer, which then does idle
				5363	* load balancing for all the idle CPUs.
				5364	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5365	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5366	cpumask_var_t idle_cpus_mask;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5367	atomic_t nr_cpus;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5368	unsigned long next_balance; /* in jiffy units */
				5369	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5370
Peter Zijlstra	8e7fbcb	2012-01-09 11:28:35 +0100	[diff] [blame]	5371	static inline int find_new_ilb(int call_cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5372	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5373	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5374
Suresh Siddha	786d6dc7	2011-12-01 17:07:35 -0800	[diff] [blame]	5375	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				5376	return ilb;
				5377
				5378	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5379	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5380
				5381	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5382	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				5383	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				5384	* CPU (if there is one).
				5385	*/
				5386	static void nohz_balancer_kick(int cpu)
				5387	{
				5388	int ilb_cpu;
				5389
				5390	nohz.next_balance++;
				5391
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5392	ilb_cpu = find_new_ilb(cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5393
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5394	if (ilb_cpu >= nr_cpu_ids)
				5395	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5396
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	5397	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5398	return;
				5399	/*
				5400	* Use smp_send_reschedule() instead of resched_cpu().
				5401	* This way we generate a sched IPI on the target cpu which
				5402	* is idle. And the softirq performing nohz idle load balance
				5403	* will be run before returning from the IPI.
				5404	*/
				5405	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5406	return;
				5407	}
				5408
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5409	static inline void nohz_balance_exit_idle(int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5410	{
				5411	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
				5412	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				5413	atomic_dec(&nohz.nr_cpus);
				5414	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				5415	}
				5416	}
				5417
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5418	static inline void set_cpu_sd_state_busy(void)
				5419	{
				5420	struct sched_domain *sd;
				5421	int cpu = smp_processor_id();
				5422
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5423	rcu_read_lock();
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5424	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
				5425
				5426	if (!sd \|\| !sd->nohz_idle)
				5427	goto unlock;
				5428	sd->nohz_idle = 0;
				5429
				5430	for (; sd; sd = sd->parent)
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5431	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5432	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5433	rcu_read_unlock();
				5434	}
				5435
				5436	void set_cpu_sd_state_idle(void)
				5437	{
				5438	struct sched_domain *sd;
				5439	int cpu = smp_processor_id();
				5440
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5441	rcu_read_lock();
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5442	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
				5443
				5444	if (!sd \|\| sd->nohz_idle)
				5445	goto unlock;
				5446	sd->nohz_idle = 1;
				5447
				5448	for (; sd; sd = sd->parent)
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5449	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5450	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5451	rcu_read_unlock();
				5452	}
				5453
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5454	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5455	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5456	* This info will be used in performing idle load balancing in the future.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5457	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5458	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5459	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5460	/*
				5461	* If this cpu is going down, then nothing needs to be done.
				5462	*/
				5463	if (!cpu_active(cpu))
				5464	return;
				5465
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5466	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				5467	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5468
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5469	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				5470	atomic_inc(&nohz.nr_cpus);
				5471	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5472	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5473
				5474	static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
				5475	unsigned long action, void *hcpu)
				5476	{
				5477	switch (action & ~CPU_TASKS_FROZEN) {
				5478	case CPU_DYING:
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5479	nohz_balance_exit_idle(smp_processor_id());
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5480	return NOTIFY_OK;
				5481	default:
				5482	return NOTIFY_DONE;
				5483	}
				5484	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5485	#endif
				5486
				5487	static DEFINE_SPINLOCK(balancing);
				5488
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5489	/*
				5490	* Scale the max load_balance interval with the number of CPUs in the system.
				5491	* This trades load-balance latency on larger machines for less cross talk.
				5492	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5493	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5494	{
				5495	max_load_balance_interval = HZ*num_online_cpus()/10;
				5496	}
				5497
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5498	/*
				5499	* It checks each scheduling domain to see if it is due to be balanced,
				5500	* and initiates a balancing operation if so.
				5501	*
Libin	b9b0853	2013-04-01 19:14:01 +0800	[diff] [blame]	5502	* Balancing parameters are set up in init_sched_domains.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5503	*/
				5504	static void rebalance_domains(int cpu, enum cpu_idle_type idle)
				5505	{
				5506	int balance = 1;
				5507	struct rq *rq = cpu_rq(cpu);
				5508	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	5509	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5510	/* Earliest time when we have to do rebalance again */
				5511	unsigned long next_balance = jiffies + 60*HZ;
				5512	int update_next_balance = 0;
				5513	int need_serialize;
				5514
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5515	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	5516
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5517	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5518	for_each_domain(cpu, sd) {
				5519	if (!(sd->flags & SD_LOAD_BALANCE))
				5520	continue;
				5521
				5522	interval = sd->balance_interval;
				5523	if (idle != CPU_IDLE)
				5524	interval *= sd->busy_factor;
				5525
				5526	/* scale ms to jiffies */
				5527	interval = msecs_to_jiffies(interval);
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5528	interval = clamp(interval, 1UL, max_load_balance_interval);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5529
				5530	need_serialize = sd->flags & SD_SERIALIZE;
				5531
				5532	if (need_serialize) {
				5533	if (!spin_trylock(&balancing))
				5534	goto out;
				5535	}
				5536
				5537	if (time_after_eq(jiffies, sd->last_balance + interval)) {
				5538	if (load_balance(cpu, rq, sd, idle, &balance)) {
				5539	/*
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	5540	* The LBF_SOME_PINNED logic could have changed
				5541	* env->dst_cpu, so we can't know our idle
				5542	* state even if we migrated tasks. Update it.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5543	*/
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	5544	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5545	}
				5546	sd->last_balance = jiffies;
				5547	}
				5548	if (need_serialize)
				5549	spin_unlock(&balancing);
				5550	out:
				5551	if (time_after(next_balance, sd->last_balance + interval)) {
				5552	next_balance = sd->last_balance + interval;
				5553	update_next_balance = 1;
				5554	}
				5555
				5556	/*
				5557	* Stop the load balance at this level. There is another
				5558	* CPU in our sched group which is doing load balancing more
				5559	* actively.
				5560	*/
				5561	if (!balance)
				5562	break;
				5563	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5564	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5565
				5566	/*
				5567	* next_balance will be updated only when there is a need.
				5568	* When the cpu is attached to null domain for ex, it will not be
				5569	* updated.
				5570	*/
				5571	if (likely(update_next_balance))
				5572	rq->next_balance = next_balance;
				5573	}
				5574
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5575	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5576	/*
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5577	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5578	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				5579	*/
				5580	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
				5581	{
				5582	struct rq *this_rq = cpu_rq(this_cpu);
				5583	struct rq *rq;
				5584	int balance_cpu;
				5585
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5586	if (idle != CPU_IDLE \|\|
				5587	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				5588	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5589
				5590	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	5591	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5592	continue;
				5593
				5594	/*
				5595	* If this cpu gets work to do, stop the load balancing
				5596	* work being done for other cpus. Next load
				5597	* balancing owner will pick it up.
				5598	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5599	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5600	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5601
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	5602	rq = cpu_rq(balance_cpu);
				5603
				5604	raw_spin_lock_irq(&rq->lock);
				5605	update_rq_clock(rq);
				5606	update_idle_cpu_load(rq);
				5607	raw_spin_unlock_irq(&rq->lock);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5608
				5609	rebalance_domains(balance_cpu, CPU_IDLE);
				5610
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5611	if (time_after(this_rq->next_balance, rq->next_balance))
				5612	this_rq->next_balance = rq->next_balance;
				5613	}
				5614	nohz.next_balance = this_rq->next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5615	end:
				5616	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5617	}
				5618
				5619	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5620	* Current heuristic for kicking the idle load balancer in the presence
				5621	* of an idle cpu is the system.
				5622	* - This rq has more than one task.
				5623	* - At any scheduler domain level, this cpu's scheduler group has multiple
				5624	* busy cpu's exceeding the group's power.
				5625	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				5626	* domain span are idle.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5627	*/
				5628	static inline int nohz_kick_needed(struct rq *rq, int cpu)
				5629	{
				5630	unsigned long now = jiffies;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5631	struct sched_domain *sd;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5632
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5633	if (unlikely(idle_cpu(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5634	return 0;
				5635
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5636	/*
				5637	* We may be recently in ticked or tickless idle mode. At the first
				5638	* busy tick after returning from idle, we will update the busy stats.
				5639	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5640	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5641	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5642
				5643	/*
				5644	* None are in tickless mode and hence no need for NOHZ idle load
				5645	* balancing.
				5646	*/
				5647	if (likely(!atomic_read(&nohz.nr_cpus)))
				5648	return 0;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5649
				5650	if (time_before(now, nohz.next_balance))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5651	return 0;
				5652
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5653	if (rq->nr_running >= 2)
				5654	goto need_kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5655
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5656	rcu_read_lock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5657	for_each_domain(cpu, sd) {
				5658	struct sched_group *sg = sd->groups;
				5659	struct sched_group_power *sgp = sg->sgp;
				5660	int nr_busy = atomic_read(&sgp->nr_busy_cpus);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5661
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5662	if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5663	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5664
				5665	if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
				5666	&& (cpumask_first_and(nohz.idle_cpus_mask,
				5667	sched_domain_span(sd)) < cpu))
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5668	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5669
				5670	if (!(sd->flags & (SD_SHARE_PKG_RESOURCES \| SD_ASYM_PACKING)))
				5671	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5672	}
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5673	rcu_read_unlock();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5674	return 0;
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5675
				5676	need_kick_unlock:
				5677	rcu_read_unlock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5678	need_kick:
				5679	return 1;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5680	}
				5681	#else
				5682	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
				5683	#endif
				5684
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5685	/*
				5686	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5687	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5688	*/
				5689	static void run_rebalance_domains(struct softirq_action *h)
				5690	{
				5691	int this_cpu = smp_processor_id();
				5692	struct rq *this_rq = cpu_rq(this_cpu);
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	5693	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5694	CPU_IDLE : CPU_NOT_IDLE;
				5695
				5696	rebalance_domains(this_cpu, idle);
				5697
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5698	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5699	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5700	* balancing on behalf of the other idle cpus whose ticks are
				5701	* stopped.
				5702	*/
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5703	nohz_idle_balance(this_cpu, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5704	}
				5705
				5706	static inline int on_null_domain(int cpu)
				5707	{
Paul E. McKenney	90a6501	2010-02-28 08:32:18 -0800	[diff] [blame]	5708	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5709	}
				5710
				5711	/*
				5712	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5713	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5714	void trigger_load_balance(struct rq *rq, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5715	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5716	/* Don't need to rebalance while attached to NULL domain */
				5717	if (time_after_eq(jiffies, rq->next_balance) &&
				5718	likely(!on_null_domain(cpu)))
				5719	raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5720	#ifdef CONFIG_NO_HZ_COMMON
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5721	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5722	nohz_balancer_kick(cpu);
				5723	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5724	}
				5725
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	5726	static void rq_online_fair(struct rq *rq)
				5727	{
				5728	update_sysctl();
				5729	}
				5730
				5731	static void rq_offline_fair(struct rq *rq)
				5732	{
				5733	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	5734
				5735	/* Ensure any throttled groups are reachable by pick_next_task */
				5736	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	5737	}
				5738
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	5739	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	5740
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5741	/*
				5742	* scheduler tick hitting a task of our scheduling class:
				5743	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	5744	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5745	{
				5746	struct cfs_rq *cfs_rq;
				5747	struct sched_entity *se = &curr->se;
				5748
				5749	for_each_sched_entity(se) {
				5750	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	5751	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5752	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	5753
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	5754	if (sched_feat_numa(NUMA))
				5755	task_tick_numa(rq, curr);
Linus Torvalds	3d59eeb	2012-12-16 14:33:25 -0800	[diff] [blame]	5756
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	5757	update_rq_runnable_avg(rq, 1);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5758	}
				5759
				5760	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5761	* called on fork with the child task as argument from the parent's context
				5762	* - child not yet on the tasklist
				5763	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5764	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5765	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5766	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	5767	struct cfs_rq *cfs_rq;
				5768	struct sched_entity se = &p->se, curr;
Ingo Molnar	00bf7bf	2007-10-15 17:00:14 +0200	[diff] [blame]	5769	int this_cpu = smp_processor_id();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5770	struct rq *rq = this_rq();
				5771	unsigned long flags;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5772
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	5773	raw_spin_lock_irqsave(&rq->lock, flags);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5774
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	5775	update_rq_clock(rq);
				5776
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	5777	cfs_rq = task_cfs_rq(current);
				5778	curr = cfs_rq->curr;
				5779
Paul E. McKenney	b0a0f66	2010-10-06 17:32:51 -0700	[diff] [blame]	5780	if (unlikely(task_cpu(p) != this_cpu)) {
				5781	rcu_read_lock();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5782	__set_task_cpu(p, this_cpu);
Paul E. McKenney	b0a0f66	2010-10-06 17:32:51 -0700	[diff] [blame]	5783	rcu_read_unlock();
				5784	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5785
Ting Yang	7109c44	2007-08-28 12:53:24 +0200	[diff] [blame]	5786	update_curr(cfs_rq);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5787
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	5788	if (curr)
				5789	se->vruntime = curr->vruntime;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	5790	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5791
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5792	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	5793	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	5794	* Upon rescheduling, sched_class::put_prev_task() will place
				5795	* 'current' within the tree based on its new key value.
				5796	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5797	swap(curr->vruntime, se->vruntime);
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	5798	resched_task(rq->curr);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5799	}
				5800
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	5801	se->vruntime -= cfs_rq->min_vruntime;
				5802
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	5803	raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5804	}
				5805
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5806	/*
				5807	* Priority of the task has changed. Check to see if we preempt
				5808	* the current task.
				5809	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5810	static void
				5811	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5812	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5813	if (!p->se.on_rq)
				5814	return;
				5815
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5816	/*
				5817	* Reschedule if we are currently running on this runqueue and
				5818	* our priority decreased, or if we are not currently running on
				5819	* this runqueue and our priority is higher than the current's
				5820	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5821	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5822	if (p->prio > oldprio)
				5823	resched_task(rq->curr);
				5824	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	5825	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5826	}
				5827
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5828	static void switched_from_fair(struct rq rq, struct task_struct p)
				5829	{
				5830	struct sched_entity *se = &p->se;
				5831	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				5832
				5833	/*
				5834	* Ensure the task's vruntime is normalized, so that when its
				5835	* switched back to the fair class the enqueue_entity(.flags=0) will
				5836	* do the right thing.
				5837	*
				5838	* If it was on_rq, then the dequeue_entity(.flags=0) will already
				5839	* have normalized the vruntime, if it was !on_rq, then only when
				5840	* the task is sleeping will it still have non-normalized vruntime.
				5841	*/
				5842	if (!se->on_rq && p->state != TASK_RUNNING) {
				5843	/*
				5844	* Fix up our vruntime so that the current sleep doesn't
				5845	* cause 'unlimited' sleep bonus.
				5846	*/
				5847	place_entity(cfs_rq, se, 0);
				5848	se->vruntime -= cfs_rq->min_vruntime;
				5849	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5850
				5851	#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
				5852	/*
				5853	* Remove our load from contribution when we leave sched_fair
				5854	* and ensure we don't carry in an old decay_count if we
				5855	* switch back.
				5856	*/
				5857	if (p->se.avg.decay_count) {
				5858	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
				5859	__synchronize_entity_decay(&p->se);
				5860	subtract_blocked_load_contrib(cfs_rq,
				5861	p->se.avg.load_avg_contrib);
				5862	}
				5863	#endif
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5864	}
				5865
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5866	/*
				5867	* We switched to the sched_fair class.
				5868	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5869	static void switched_to_fair(struct rq rq, struct task_struct p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5870	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5871	if (!p->se.on_rq)
				5872	return;
				5873
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5874	/*
				5875	* We were most likely switched from sched_rt, so
				5876	* kick off the schedule if running, otherwise just see
				5877	* if we can still preempt the current task.
				5878	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5879	if (rq->curr == p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5880	resched_task(rq->curr);
				5881	else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	5882	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5883	}
				5884
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	5885	/* Account for a task changing its policy or group.
				5886	*
				5887	* This routine is mostly called to set cfs_rq->curr field when a task
				5888	* migrates between groups/classes.
				5889	*/
				5890	static void set_curr_task_fair(struct rq *rq)
				5891	{
				5892	struct sched_entity *se = &rq->curr->se;
				5893
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	5894	for_each_sched_entity(se) {
				5895	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				5896
				5897	set_next_entity(cfs_rq, se);
				5898	/* ensure bandwidth has been allocated on our new cfs_rq */
				5899	account_cfs_rq_runtime(cfs_rq, 0);
				5900	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	5901	}
				5902
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5903	void init_cfs_rq(struct cfs_rq *cfs_rq)
				5904	{
				5905	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5906	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				5907	#ifndef CONFIG_64BIT
				5908	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				5909	#endif
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5910	#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
				5911	atomic64_set(&cfs_rq->decay_counter, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5912	atomic64_set(&cfs_rq->removed_load, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5913	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5914	}
				5915
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5916	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5917	static void task_move_group_fair(struct task_struct *p, int on_rq)
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5918	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5919	struct cfs_rq *cfs_rq;
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5920	/*
				5921	* If the task was not on the rq at the time of this cgroup movement
				5922	* it must have been asleep, sleeping tasks keep their ->vruntime
				5923	* absolute on their old rq until wakeup (needed for the fair sleeper
				5924	* bonus in place_entity()).
				5925	*
				5926	* If it was on the rq, we've just 'preempted' it, which does convert
				5927	* ->vruntime to a relative base.
				5928	*
				5929	* Make sure both cases convert their relative position when migrating
				5930	* to another cgroup's rq. This does somewhat interfere with the
				5931	* fair sleeper stuff for the first placement, but who cares.
				5932	*/
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5933	/*
				5934	* When !on_rq, vruntime of the task has usually NOT been normalized.
				5935	* But there are some cases where it has already been normalized:
				5936	*
				5937	* - Moving a forked child which is waiting for being woken up by
				5938	* wake_up_new_task().
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	5939	* - Moving a task which has been woken up by try_to_wake_up() and
				5940	* waiting for actually being woken up by sched_ttwu_pending().
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5941	*
				5942	* To prevent boost or penalty in the new cfs_rq caused by delta
				5943	* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
				5944	*/
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	5945	if (!on_rq && (!p->se.sum_exec_runtime \|\| p->state == TASK_WAKING))
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5946	on_rq = 1;
				5947
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	5948	if (!on_rq)
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5949	p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
				5950	set_task_rq(p, task_cpu(p));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5951	if (!on_rq) {
				5952	cfs_rq = cfs_rq_of(&p->se);
				5953	p->se.vruntime += cfs_rq->min_vruntime;
				5954	#ifdef CONFIG_SMP
				5955	/*
				5956	* migrate_task_rq_fair() will have removed our previous
				5957	* contribution, but we must synchronize for ongoing future
				5958	* decay.
				5959	*/
				5960	p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				5961	cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
				5962	#endif
				5963	}
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5964	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5965
				5966	void free_fair_sched_group(struct task_group *tg)
				5967	{
				5968	int i;
				5969
				5970	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				5971
				5972	for_each_possible_cpu(i) {
				5973	if (tg->cfs_rq)
				5974	kfree(tg->cfs_rq[i]);
				5975	if (tg->se)
				5976	kfree(tg->se[i]);
				5977	}
				5978
				5979	kfree(tg->cfs_rq);
				5980	kfree(tg->se);
				5981	}
				5982
				5983	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				5984	{
				5985	struct cfs_rq *cfs_rq;
				5986	struct sched_entity *se;
				5987	int i;
				5988
				5989	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				5990	if (!tg->cfs_rq)
				5991	goto err;
				5992	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				5993	if (!tg->se)
				5994	goto err;
				5995
				5996	tg->shares = NICE_0_LOAD;
				5997
				5998	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				5999
				6000	for_each_possible_cpu(i) {
				6001	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				6002	GFP_KERNEL, cpu_to_node(i));
				6003	if (!cfs_rq)
				6004	goto err;
				6005
				6006	se = kzalloc_node(sizeof(struct sched_entity),
				6007	GFP_KERNEL, cpu_to_node(i));
				6008	if (!se)
				6009	goto err_free_rq;
				6010
				6011	init_cfs_rq(cfs_rq);
				6012	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
				6013	}
				6014
				6015	return 1;
				6016
				6017	err_free_rq:
				6018	kfree(cfs_rq);
				6019	err:
				6020	return 0;
				6021	}
				6022
				6023	void unregister_fair_sched_group(struct task_group *tg, int cpu)
				6024	{
				6025	struct rq *rq = cpu_rq(cpu);
				6026	unsigned long flags;
				6027
				6028	/*
				6029	* Only empty task groups can be destroyed; so we can speculatively
				6030	* check on_list without danger of it being re-added.
				6031	*/
				6032	if (!tg->cfs_rq[cpu]->on_list)
				6033	return;
				6034
				6035	raw_spin_lock_irqsave(&rq->lock, flags);
				6036	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				6037	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6038	}
				6039
				6040	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				6041	struct sched_entity *se, int cpu,
				6042	struct sched_entity *parent)
				6043	{
				6044	struct rq *rq = cpu_rq(cpu);
				6045
				6046	cfs_rq->tg = tg;
				6047	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6048	init_cfs_rq_runtime(cfs_rq);
				6049
				6050	tg->cfs_rq[cpu] = cfs_rq;
				6051	tg->se[cpu] = se;
				6052
				6053	/* se could be NULL for root_task_group */
				6054	if (!se)
				6055	return;
				6056
				6057	if (!parent)
				6058	se->cfs_rq = &rq->cfs;
				6059	else
				6060	se->cfs_rq = parent->my_q;
				6061
				6062	se->my_q = cfs_rq;
				6063	update_load_set(&se->load, 0);
				6064	se->parent = parent;
				6065	}
				6066
				6067	static DEFINE_MUTEX(shares_mutex);
				6068
				6069	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				6070	{
				6071	int i;
				6072	unsigned long flags;
				6073
				6074	/*
				6075	* We can't change the weight of the root cgroup.
				6076	*/
				6077	if (!tg->se[0])
				6078	return -EINVAL;
				6079
				6080	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				6081
				6082	mutex_lock(&shares_mutex);
				6083	if (tg->shares == shares)
				6084	goto done;
				6085
				6086	tg->shares = shares;
				6087	for_each_possible_cpu(i) {
				6088	struct rq *rq = cpu_rq(i);
				6089	struct sched_entity *se;
				6090
				6091	se = tg->se[i];
				6092	/* Propagate contribution to hierarchy */
				6093	raw_spin_lock_irqsave(&rq->lock, flags);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	6094	for_each_sched_entity(se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6095	update_cfs_shares(group_cfs_rq(se));
				6096	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6097	}
				6098
				6099	done:
				6100	mutex_unlock(&shares_mutex);
				6101	return 0;
				6102	}
				6103	#else /* CONFIG_FAIR_GROUP_SCHED */
				6104
				6105	void free_fair_sched_group(struct task_group *tg) { }
				6106
				6107	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				6108	{
				6109	return 1;
				6110	}
				6111
				6112	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
				6113
				6114	#endif /* CONFIG_FAIR_GROUP_SCHED */
				6115
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6116
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	6117	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6118	{
				6119	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6120	unsigned int rr_interval = 0;
				6121
				6122	/*
				6123	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				6124	* idle runqueue:
				6125	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6126	if (rq->cfs.load.weight)
Zhu Yanhai	a59f4e0	2013-01-08 12:56:52 +0800	[diff] [blame]	6127	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6128
				6129	return rr_interval;
				6130	}
				6131
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6132	/*
				6133	* All the scheduling class methods:
				6134	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6135	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	6136	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6137	.enqueue_task = enqueue_task_fair,
				6138	.dequeue_task = dequeue_task_fair,
				6139	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6140	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6141
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	6142	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6143
				6144	.pick_next_task = pick_next_task_fair,
				6145	.put_prev_task = put_prev_task_fair,
				6146
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6147	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	6148	.select_task_rq = select_task_rq_fair,
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	6149	#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6150	.migrate_task_rq = migrate_task_rq_fair,
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	6151	#endif
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	6152	.rq_online = rq_online_fair,
				6153	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6154
				6155	.task_waking = task_waking_fair,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6156	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6157
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6158	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6159	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6160	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6161
				6162	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6163	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6164	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6165
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6166	.get_rr_interval = get_rr_interval_fair,
				6167
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6168	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6169	.task_move_group = task_move_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6170	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6171	};
				6172
				6173	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6174	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6175	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6176	struct cfs_rq *cfs_rq;
				6177
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6178	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	6179	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	6180	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6181	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6182	}
				6183	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6184
				6185	__init void init_sched_fair_class(void)
				6186	{
				6187	#ifdef CONFIG_SMP
				6188	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				6189
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	6190	#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	6191	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6192	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6193	cpu_notifier(sched_ilb_notifier, 0);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6194	#endif
				6195	#endif /* SMP */
				6196
				6197	}