Blame - kernel/sched/fair.c - SHIFTPHONES/android_kernel_shift_sdm845

blob: 5eea8707234a15167eeaabbc5881f04cf0345ca1 [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
				20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	23	#include <linux/latencytop.h>
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	24	#include <linux/sched.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	25	#include <linux/cpumask.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	26	#include <linux/slab.h>
				27	#include <linux/profile.h>
				28	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	29	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	30	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	31	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	32
				33	#include <trace/events/sched.h>
				34
				35	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	36
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	37	/*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	38	* Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	39	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	40	*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	41	* NOTE: this latency value is not the same as the concept of
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	42	* 'timeslice length' - timeslices in CFS are of variable length
				43	* and have no persistent notion like in traditional, time-slice
				44	* based scheduling concepts.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	45	*
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	46	* (to see the precise effective timeslice length of your workload,
				47	* run vmstat and monitor the context-switches (cs) field)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	48	*/
Mike Galbraith	2140692	2010-03-11 17:17:15 +0100	[diff] [blame]	49	unsigned int sysctl_sched_latency = 6000000ULL;
				50	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	51
				52	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	53	* The initial- and re-scaling of tunables is configurable
				54	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
				55	*
				56	* Options are:
				57	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				58	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				59	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				60	*/
				61	enum sched_tunable_scaling sysctl_sched_tunable_scaling
				62	= SCHED_TUNABLESCALING_LOG;
				63
				64	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	65	* Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	66	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	67	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	68	unsigned int sysctl_sched_min_granularity = 750000ULL;
				69	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	70
				71	/*
				72	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
				73	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	74	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	75
				76	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	77	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	78	* parent will (try to) run first.
				79	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	80	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	81
				82	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	83	* SCHED_OTHER wake-up granularity.
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	84	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	85	*
				86	* This option delays the preemption effects of decoupled workloads
				87	* and reduces their over-scheduling. Synchronous workloads will still
				88	* have immediate wakeup/sleep latencies.
				89	*/
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	90	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	91	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	92
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	93	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				94
Paul Turner	a7a4f8a	2010-11-15 15:47:06 -0800	[diff] [blame]	95	/*
				96	* The exponential sliding window over which load is averaged for shares
				97	* distribution.
				98	* (default: 10msec)
				99	*/
				100	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
				101
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	102	#ifdef CONFIG_CFS_BANDWIDTH
				103	/*
				104	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				105	* each time a cfs_rq requests quota.
				106	*
				107	* Note: in the case that the slice exceeds the runtime remaining (either due
				108	* to consumption or the quota being specified to be smaller than the slice)
				109	* we will always only issue the remaining available time.
				110	*
				111	* default: 5 msec, units: microseconds
				112	*/
				113	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				114	#endif
				115
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	116	/*
				117	* Increase the granularity value when there are more CPUs,
				118	* because with more CPUs the 'effective latency' as visible
				119	* to users decreases. But the relationship is not linear,
				120	* so pick a second-best guess by going with the log2 of the
				121	* number of CPUs.
				122	*
				123	* This idea comes from the SD scheduler of Con Kolivas:
				124	*/
				125	static int get_update_sysctl_factor(void)
				126	{
				127	unsigned int cpus = min_t(int, num_online_cpus(), 8);
				128	unsigned int factor;
				129
				130	switch (sysctl_sched_tunable_scaling) {
				131	case SCHED_TUNABLESCALING_NONE:
				132	factor = 1;
				133	break;
				134	case SCHED_TUNABLESCALING_LINEAR:
				135	factor = cpus;
				136	break;
				137	case SCHED_TUNABLESCALING_LOG:
				138	default:
				139	factor = 1 + ilog2(cpus);
				140	break;
				141	}
				142
				143	return factor;
				144	}
				145
				146	static void update_sysctl(void)
				147	{
				148	unsigned int factor = get_update_sysctl_factor();
				149
				150	#define SET_SYSCTL(name) \
				151	(sysctl_##name = (factor) * normalized_sysctl_##name)
				152	SET_SYSCTL(sched_min_granularity);
				153	SET_SYSCTL(sched_latency);
				154	SET_SYSCTL(sched_wakeup_granularity);
				155	#undef SET_SYSCTL
				156	}
				157
				158	void sched_init_granularity(void)
				159	{
				160	update_sysctl();
				161	}
				162
				163	#if BITS_PER_LONG == 32
				164	# define WMULT_CONST (~0UL)
				165	#else
				166	# define WMULT_CONST (1UL << 32)
				167	#endif
				168
				169	#define WMULT_SHIFT 32
				170
				171	/*
				172	* Shift right and round:
				173	*/
				174	#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
				175
				176	/*
				177	* delta *= weight / lw
				178	*/
				179	static unsigned long
				180	calc_delta_mine(unsigned long delta_exec, unsigned long weight,
				181	struct load_weight *lw)
				182	{
				183	u64 tmp;
				184
				185	/*
				186	* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
				187	* entities since MIN_SHARES = 2. Treat weight as 1 if less than
				188	* 2^SCHED_LOAD_RESOLUTION.
				189	*/
				190	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
				191	tmp = (u64)delta_exec * scale_load_down(weight);
				192	else
				193	tmp = (u64)delta_exec;
				194
				195	if (!lw->inv_weight) {
				196	unsigned long w = scale_load_down(lw->weight);
				197
				198	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				199	lw->inv_weight = 1;
				200	else if (unlikely(!w))
				201	lw->inv_weight = WMULT_CONST;
				202	else
				203	lw->inv_weight = WMULT_CONST / w;
				204	}
				205
				206	/*
				207	* Check whether we'd overflow the 64-bit multiplication:
				208	*/
				209	if (unlikely(tmp > WMULT_CONST))
				210	tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
				211	WMULT_SHIFT/2);
				212	else
				213	tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
				214
				215	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
				216	}
				217
				218
				219	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	220
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	221	/**************************************************************
				222	* CFS operations on generic schedulable entities:
				223	*/
				224
				225	#ifdef CONFIG_FAIR_GROUP_SCHED
				226
				227	/* cpu runqueue to which this cfs_rq is attached */
				228	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				229	{
				230	return cfs_rq->rq;
				231	}
				232
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	233	/* An entity is a task if it doesn't "own" a runqueue */
				234	#define entity_is_task(se) (!se->my_q)
				235
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	236	static inline struct task_struct task_of(struct sched_entity se)
				237	{
				238	#ifdef CONFIG_SCHED_DEBUG
				239	WARN_ON_ONCE(!entity_is_task(se));
				240	#endif
				241	return container_of(se, struct task_struct, se);
				242	}
				243
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	244	/* Walk up scheduling entities hierarchy */
				245	#define for_each_sched_entity(se) \
				246	for (; se; se = se->parent)
				247
				248	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				249	{
				250	return p->se.cfs_rq;
				251	}
				252
				253	/* runqueue on which this entity is (to be) queued */
				254	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				255	{
				256	return se->cfs_rq;
				257	}
				258
				259	/* runqueue "owned" by this group */
				260	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				261	{
				262	return grp->my_q;
				263	}
				264
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	265	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				266	int force_update);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	267
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	268	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				269	{
				270	if (!cfs_rq->on_list) {
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	271	/*
				272	* Ensure we either appear before our parent (if already
				273	* enqueued) or force our parent to appear after us when it is
				274	* enqueued. The fact that we always enqueue bottom-up
				275	* reduces this to two cases.
				276	*/
				277	if (cfs_rq->tg->parent &&
				278	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
				279	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	280	&rq_of(cfs_rq)->leaf_cfs_rq_list);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	281	} else {
				282	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				283	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				284	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	285
				286	cfs_rq->on_list = 1;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	287	/* We should have no load, but we need to update last_decay. */
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	288	update_cfs_rq_blocked_load(cfs_rq, 0);
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	289	}
				290	}
				291
				292	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				293	{
				294	if (cfs_rq->on_list) {
				295	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				296	cfs_rq->on_list = 0;
				297	}
				298	}
				299
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	300	/* Iterate thr' all leaf cfs_rq's on a runqueue */
				301	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				302	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
				303
				304	/* Do the two (enqueued) entities belong to the same group ? */
				305	static inline int
				306	is_same_group(struct sched_entity se, struct sched_entity pse)
				307	{
				308	if (se->cfs_rq == pse->cfs_rq)
				309	return 1;
				310
				311	return 0;
				312	}
				313
				314	static inline struct sched_entity parent_entity(struct sched_entity se)
				315	{
				316	return se->parent;
				317	}
				318
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	319	/* return depth at which a sched entity is present in the hierarchy */
				320	static inline int depth_se(struct sched_entity *se)
				321	{
				322	int depth = 0;
				323
				324	for_each_sched_entity(se)
				325	depth++;
				326
				327	return depth;
				328	}
				329
				330	static void
				331	find_matching_se(struct sched_entity se, struct sched_entity pse)
				332	{
				333	int se_depth, pse_depth;
				334
				335	/*
				336	* preemption test can be made between sibling entities who are in the
				337	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				338	* both tasks until we find their ancestors who are siblings of common
				339	* parent.
				340	*/
				341
				342	/* First walk up until both entities are at same depth */
				343	se_depth = depth_se(*se);
				344	pse_depth = depth_se(*pse);
				345
				346	while (se_depth > pse_depth) {
				347	se_depth--;
				348	se = parent_entity(se);
				349	}
				350
				351	while (pse_depth > se_depth) {
				352	pse_depth--;
				353	pse = parent_entity(pse);
				354	}
				355
				356	while (!is_same_group(se, pse)) {
				357	se = parent_entity(se);
				358	pse = parent_entity(pse);
				359	}
				360	}
				361
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	362	#else /* !CONFIG_FAIR_GROUP_SCHED */
				363
				364	static inline struct task_struct task_of(struct sched_entity se)
				365	{
				366	return container_of(se, struct task_struct, se);
				367	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	368
				369	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				370	{
				371	return container_of(cfs_rq, struct rq, cfs);
				372	}
				373
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	374	#define entity_is_task(se) 1
				375
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	376	#define for_each_sched_entity(se) \
				377	for (; se; se = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	378
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	379	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	380	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	381	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	382	}
				383
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	384	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				385	{
				386	struct task_struct *p = task_of(se);
				387	struct rq *rq = task_rq(p);
				388
				389	return &rq->cfs;
				390	}
				391
				392	/* runqueue "owned" by this group */
				393	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				394	{
				395	return NULL;
				396	}
				397
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	398	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				399	{
				400	}
				401
				402	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				403	{
				404	}
				405
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	406	#define for_each_leaf_cfs_rq(rq, cfs_rq) \
				407	for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
				408
				409	static inline int
				410	is_same_group(struct sched_entity se, struct sched_entity pse)
				411	{
				412	return 1;
				413	}
				414
				415	static inline struct sched_entity parent_entity(struct sched_entity se)
				416	{
				417	return NULL;
				418	}
				419
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	420	static inline void
				421	find_matching_se(struct sched_entity se, struct sched_entity pse)
				422	{
				423	}
				424
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	425	#endif /* CONFIG_FAIR_GROUP_SCHED */
				426
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	427	static __always_inline
				428	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	429
				430	/**************************************************************
				431	* Scheduling class tree data structure manipulation methods:
				432	*/
				433
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	434	static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	435	{
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	436	s64 delta = (s64)(vruntime - min_vruntime);
				437	if (delta > 0)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	438	min_vruntime = vruntime;
				439
				440	return min_vruntime;
				441	}
				442
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	443	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	444	{
				445	s64 delta = (s64)(vruntime - min_vruntime);
				446	if (delta < 0)
				447	min_vruntime = vruntime;
				448
				449	return min_vruntime;
				450	}
				451
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	452	static inline int entity_before(struct sched_entity *a,
				453	struct sched_entity *b)
				454	{
				455	return (s64)(a->vruntime - b->vruntime) < 0;
				456	}
				457
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	458	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				459	{
				460	u64 vruntime = cfs_rq->min_vruntime;
				461
				462	if (cfs_rq->curr)
				463	vruntime = cfs_rq->curr->vruntime;
				464
				465	if (cfs_rq->rb_leftmost) {
				466	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				467	struct sched_entity,
				468	run_node);
				469
Peter Zijlstra	e17036d	2009-01-15 14:53:39 +0100	[diff] [blame]	470	if (!cfs_rq->curr)
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	471	vruntime = se->vruntime;
				472	else
				473	vruntime = min_vruntime(vruntime, se->vruntime);
				474	}
				475
				476	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	477	#ifndef CONFIG_64BIT
				478	smp_wmb();
				479	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				480	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	481	}
				482
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	483	/*
				484	* Enqueue an entity into the rb-tree:
				485	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	486	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	487	{
				488	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				489	struct rb_node *parent = NULL;
				490	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	491	int leftmost = 1;
				492
				493	/*
				494	* Find the right place in the rbtree:
				495	*/
				496	while (*link) {
				497	parent = *link;
				498	entry = rb_entry(parent, struct sched_entity, run_node);
				499	/*
				500	* We dont care about collisions. Nodes with
				501	* the same key stay together.
				502	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	503	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	504	link = &parent->rb_left;
				505	} else {
				506	link = &parent->rb_right;
				507	leftmost = 0;
				508	}
				509	}
				510
				511	/*
				512	* Maintain a cache of leftmost tree entries (it is frequently
				513	* used):
				514	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	515	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	516	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	517
				518	rb_link_node(&se->run_node, parent, link);
				519	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	520	}
				521
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	522	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	523	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	524	if (cfs_rq->rb_leftmost == &se->run_node) {
				525	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	526
				527	next_node = rb_next(&se->run_node);
				528	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	529	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	530
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	531	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	532	}
				533
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	534	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	535	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	536	struct rb_node *left = cfs_rq->rb_leftmost;
				537
				538	if (!left)
				539	return NULL;
				540
				541	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	542	}
				543
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	544	static struct sched_entity __pick_next_entity(struct sched_entity se)
				545	{
				546	struct rb_node *next = rb_next(&se->run_node);
				547
				548	if (!next)
				549	return NULL;
				550
				551	return rb_entry(next, struct sched_entity, run_node);
				552	}
				553
				554	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	555	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	556	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	557	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	558
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	559	if (!last)
				560	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	561
				562	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	563	}
				564
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	565	/**************************************************************
				566	* Scheduling class statistics methods:
				567	*/
				568
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	569	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	570	void __user buffer, size_t lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	571	loff_t *ppos)
				572	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	573	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	574	int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	575
				576	if (ret \|\| !write)
				577	return ret;
				578
				579	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				580	sysctl_sched_min_granularity);
				581
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	582	#define WRT_SYSCTL(name) \
				583	(normalized_sysctl_##name = sysctl_##name / (factor))
				584	WRT_SYSCTL(sched_min_granularity);
				585	WRT_SYSCTL(sched_latency);
				586	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	587	#undef WRT_SYSCTL
				588
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	589	return 0;
				590	}
				591	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	592
				593	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	594	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	595	*/
				596	static inline unsigned long
				597	calc_delta_fair(unsigned long delta, struct sched_entity *se)
				598	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	599	if (unlikely(se->load.weight != NICE_0_LOAD))
				600	delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	601
				602	return delta;
				603	}
				604
				605	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	606	* The idea is to set a period in which each task runs once.
				607	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	608	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	609	* this period because otherwise the slices get too small.
				610	*
				611	* p = (nr <= nl) ? l : l*nr/nl
				612	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	613	static u64 __sched_period(unsigned long nr_running)
				614	{
				615	u64 period = sysctl_sched_latency;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	616	unsigned long nr_latency = sched_nr_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	617
				618	if (unlikely(nr_running > nr_latency)) {
Peter Zijlstra	4bf0b77	2008-01-25 21:08:21 +0100	[diff] [blame]	619	period = sysctl_sched_min_granularity;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	620	period *= nr_running;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	621	}
				622
				623	return period;
				624	}
				625
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	626	/*
				627	* We calculate the wall-time slice from the period by taking a part
				628	* proportional to the weight.
				629	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	630	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	631	*/
Peter Zijlstra	6d0f0ebd	2007-10-15 17:00:05 +0200	[diff] [blame]	632	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	633	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	634	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	635
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	636	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	637	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	638	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	639
				640	cfs_rq = cfs_rq_of(se);
				641	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	642
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	643	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	644	lw = cfs_rq->load;
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	645
				646	update_load_add(&lw, se->load.weight);
				647	load = &lw;
				648	}
				649	slice = calc_delta_mine(slice, se->load.weight, load);
				650	}
				651	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	652	}
				653
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	654	/*
Peter Zijlstra	ac884de	2008-04-19 19:45:00 +0200	[diff] [blame]	655	* We calculate the vruntime slice of a to be inserted task
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	656	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	657	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	658	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	659	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	660	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	661	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	662	}
				663
				664	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	665	* Update the current task's runtime statistics. Skip current tasks that
				666	* are not in our scheduling class.
				667	*/
				668	static inline void
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	669	__update_curr(struct cfs_rq cfs_rq, struct sched_entity curr,
				670	unsigned long delta_exec)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	671	{
Ingo Molnar	bbdba7c	2007-10-15 17:00:06 +0200	[diff] [blame]	672	unsigned long delta_exec_weighted;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	673
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	674	schedstat_set(curr->statistics.exec_max,
				675	max((u64)delta_exec, curr->statistics.exec_max));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	676
				677	curr->sum_exec_runtime += delta_exec;
Ingo Molnar	7a62eab	2007-10-15 17:00:06 +0200	[diff] [blame]	678	schedstat_add(cfs_rq, exec_clock, delta_exec);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	679	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	680
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	681	curr->vruntime += delta_exec_weighted;
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	682	update_min_vruntime(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	683	}
				684
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	685	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	686	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	687	struct sched_entity *curr = cfs_rq->curr;
Venkatesh Pallipadi	305e683	2010-10-04 17:03:21 -0700	[diff] [blame]	688	u64 now = rq_of(cfs_rq)->clock_task;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	689	unsigned long delta_exec;
				690
				691	if (unlikely(!curr))
				692	return;
				693
				694	/*
				695	* Get the amount of time the current task was running
				696	* since the last time we changed load (this cannot
				697	* overflow on 32 bits):
				698	*/
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	699	delta_exec = (unsigned long)(now - curr->exec_start);
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	700	if (!delta_exec)
				701	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	702
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	703	__update_curr(cfs_rq, curr, delta_exec);
				704	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	705
				706	if (entity_is_task(curr)) {
				707	struct task_struct *curtask = task_of(curr);
				708
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	709	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	710	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	711	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	712	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	713
				714	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	715	}
				716
				717	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	718	update_stats_wait_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	719	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	720	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	721	}
				722
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	723	/*
				724	* Task is being enqueued - update stats:
				725	*/
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	726	static void update_stats_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	727	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	728	/*
				729	* Are we enqueueing a waiting task? (for current tasks
				730	* a dequeue/enqueue event is a NOP)
				731	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	732	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	733	update_stats_wait_start(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	734	}
				735
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	736	static void
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	737	update_stats_wait_end(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	738	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	739	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
				740	rq_of(cfs_rq)->clock - se->statistics.wait_start));
				741	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
				742	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
				743	rq_of(cfs_rq)->clock - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	744	#ifdef CONFIG_SCHEDSTATS
				745	if (entity_is_task(se)) {
				746	trace_sched_stat_wait(task_of(se),
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	747	rq_of(cfs_rq)->clock - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	748	}
				749	#endif
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	750	schedstat_set(se->statistics.wait_start, 0);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	751	}
				752
				753	static inline void
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	754	update_stats_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	755	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	756	/*
				757	* Mark the end of the wait period if dequeueing a
				758	* waiting task:
				759	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	760	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	761	update_stats_wait_end(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	762	}
				763
				764	/*
				765	* We are picking a new current task - update its stats:
				766	*/
				767	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	768	update_stats_curr_start(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	769	{
				770	/*
				771	* We are starting a new run period:
				772	*/
Venkatesh Pallipadi	305e683	2010-10-04 17:03:21 -0700	[diff] [blame]	773	se->exec_start = rq_of(cfs_rq)->clock_task;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	774	}
				775
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	776	/**************************************************
				777	* Scheduling class queueing methods:
				778	*/
				779
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	780	#ifdef CONFIG_NUMA_BALANCING
				781	/*
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	782	* numa task sample period in ms
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	783	*/
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	784	unsigned int sysctl_numa_balancing_scan_period_min = 100;
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	785	unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
				786	unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	787
				788	/* Portion of address space to scan in MB */
				789	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	790
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	791	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				792	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				793
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	794	static void task_numa_placement(struct task_struct *p)
				795	{
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame^]	796	int seq;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	797
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame^]	798	if (!p->mm) /* for example, ksmd faulting in a user's mm */
				799	return;
				800	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	801	if (p->numa_scan_seq == seq)
				802	return;
				803	p->numa_scan_seq = seq;
				804
				805	/* FIXME: Scheduling placement policy hints go here */
				806	}
				807
				808	/*
				809	* Got a PROT_NONE fault for a page on @node.
				810	*/
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	811	void task_numa_fault(int node, int pages, bool migrated)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	812	{
				813	struct task_struct *p = current;
				814
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	815	if (!sched_feat_numa(NUMA))
				816	return;
				817
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	818	/* FIXME: Allocate task-specific structure for placement policy here */
				819
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	820	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	821	* If pages are properly placed (did not migrate) then scan slower.
				822	* This is reset periodically in case of phase changes
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	823	*/
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	824	if (!migrated)
				825	p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
				826	p->numa_scan_period + jiffies_to_msecs(10));
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	827
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	828	task_numa_placement(p);
				829	}
				830
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	831	static void reset_ptenuma_scan(struct task_struct *p)
				832	{
				833	ACCESS_ONCE(p->mm->numa_scan_seq)++;
				834	p->mm->numa_scan_offset = 0;
				835	}
				836
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	837	/*
				838	* The expensive part of numa migration is done from task_work context.
				839	* Triggered from task_tick_numa().
				840	*/
				841	void task_numa_work(struct callback_head *work)
				842	{
				843	unsigned long migrate, next_scan, now = jiffies;
				844	struct task_struct *p = current;
				845	struct mm_struct *mm = p->mm;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	846	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	847	unsigned long start, end;
				848	long pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	849
				850	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
				851
				852	work->next = work; /* protect against double add */
				853	/*
				854	* Who cares about NUMA placement when they're dying.
				855	*
				856	* NOTE: make sure not to dereference p->mm before this check,
				857	* exit_task_work() happens _after_ exit_mm() so we could be called
				858	* without p->mm even though we still had it when we enqueued this
				859	* work.
				860	*/
				861	if (p->flags & PF_EXITING)
				862	return;
				863
				864	/*
Mel Gorman	5bca230	2012-11-22 14:40:03 +0000	[diff] [blame]	865	* We do not care about task placement until a task runs on a node
				866	* other than the first one used by the address space. This is
				867	* largely because migrations are driven by what CPU the task
				868	* is running on. If it's never scheduled on another node, it'll
				869	* not migrate so why bother trapping the fault.
				870	*/
				871	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
				872	mm->first_nid = numa_node_id();
				873	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
				874	/* Are we running on a new node yet? */
				875	if (numa_node_id() == mm->first_nid &&
				876	!sched_feat_numa(NUMA_FORCE))
				877	return;
				878
				879	mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
				880	}
				881
				882	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	883	* Reset the scan period if enough time has gone by. Objective is that
				884	* scanning will be reduced if pages are properly placed. As tasks
				885	* can enter different phases this needs to be re-examined. Lacking
				886	* proper tracking of reference behaviour, this blunt hammer is used.
				887	*/
				888	migrate = mm->numa_next_reset;
				889	if (time_after(now, migrate)) {
				890	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
				891	next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				892	xchg(&mm->numa_next_reset, next_scan);
				893	}
				894
				895	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	896	* Enforce maximal scan/migration frequency..
				897	*/
				898	migrate = mm->numa_next_scan;
				899	if (time_before(now, migrate))
				900	return;
				901
				902	if (p->numa_scan_period == 0)
				903	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
				904
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	905	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	906	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				907	return;
				908
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	909	/*
				910	* Do not set pte_numa if the current running node is rate-limited.
				911	* This loses statistics on the fault but if we are unwilling to
				912	* migrate to this node, it is less likely we can do useful work
				913	*/
				914	if (migrate_ratelimited(numa_node_id()))
				915	return;
				916
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	917	start = mm->numa_scan_offset;
				918	pages = sysctl_numa_balancing_scan_size;
				919	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
				920	if (!pages)
				921	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	922
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	923	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	924	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	925	if (!vma) {
				926	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	927	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	928	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	929	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	930	for (; vma; vma = vma->vm_next) {
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	931	if (!vma_migratable(vma))
				932	continue;
				933
				934	/* Skip small VMAs. They are not likely to be of relevance */
Mel Gorman	221392c	2012-12-17 14:05:53 +0000	[diff] [blame]	935	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	936	continue;
				937
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	938	do {
				939	start = max(start, vma->vm_start);
				940	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				941	end = min(end, vma->vm_end);
				942	pages -= change_prot_numa(vma, start, end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	943
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	944	start = end;
				945	if (pages <= 0)
				946	goto out;
				947	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	948	}
				949
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	950	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	951	/*
				952	* It is possible to reach the end of the VMA list but the last few VMAs are
				953	* not guaranteed to the vma_migratable. If they are not, we would find the
				954	* !migratable VMA on the next scan but not reset the scanner to the start
				955	* so check it now.
				956	*/
				957	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	958	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	959	else
				960	reset_ptenuma_scan(p);
				961	up_read(&mm->mmap_sem);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	962	}
				963
				964	/*
				965	* Drive the periodic memory faults..
				966	*/
				967	void task_tick_numa(struct rq rq, struct task_struct curr)
				968	{
				969	struct callback_head *work = &curr->numa_work;
				970	u64 period, now;
				971
				972	/*
				973	* We don't care about NUMA placement if we don't have memory.
				974	*/
				975	if (!curr->mm \|\| (curr->flags & PF_EXITING) \|\| work->next != work)
				976	return;
				977
				978	/*
				979	* Using runtime rather than walltime has the dual advantage that
				980	* we (mostly) drive the selection from busy threads and that the
				981	* task needs to have done some actual work before we bother with
				982	* NUMA placement.
				983	*/
				984	now = curr->se.sum_exec_runtime;
				985	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				986
				987	if (now - curr->node_stamp > period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	988	if (!curr->node_stamp)
				989	curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	990	curr->node_stamp = now;
				991
				992	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				993	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				994	task_work_add(curr, work, true);
				995	}
				996	}
				997	}
				998	#else
				999	static void task_tick_numa(struct rq rq, struct task_struct curr)
				1000	{
				1001	}
				1002	#endif /* CONFIG_NUMA_BALANCING */
				1003
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1004	static void
				1005	account_entity_enqueue(struct cfs_rq cfs_rq, struct sched_entity se)
				1006	{
				1007	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1008	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1009	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1010	#ifdef CONFIG_SMP
				1011	if (entity_is_task(se))
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	1012	list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1013	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1014	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1015	}
				1016
				1017	static void
				1018	account_entity_dequeue(struct cfs_rq cfs_rq, struct sched_entity se)
				1019	{
				1020	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1021	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1022	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1023	if (entity_is_task(se))
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	1024	list_del_init(&se->group_node);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1025	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1026	}
				1027
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1028	#ifdef CONFIG_FAIR_GROUP_SCHED
				1029	# ifdef CONFIG_SMP
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1030	static inline long calc_tg_weight(struct task_group tg, struct cfs_rq cfs_rq)
				1031	{
				1032	long tg_weight;
				1033
				1034	/*
				1035	* Use this CPU's actual weight instead of the last load_contribution
				1036	* to gain a more accurate current total weight. See
				1037	* update_cfs_rq_load_contribution().
				1038	*/
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1039	tg_weight = atomic64_read(&tg->load_avg);
				1040	tg_weight -= cfs_rq->tg_load_contrib;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1041	tg_weight += cfs_rq->load.weight;
				1042
				1043	return tg_weight;
				1044	}
				1045
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1046	static long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1047	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1048	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1049
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1050	tg_weight = calc_tg_weight(tg, cfs_rq);
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1051	load = cfs_rq->load.weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1052
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1053	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1054	if (tg_weight)
				1055	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1056
				1057	if (shares < MIN_SHARES)
				1058	shares = MIN_SHARES;
				1059	if (shares > tg->shares)
				1060	shares = tg->shares;
				1061
				1062	return shares;
				1063	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1064	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1065	static inline long calc_cfs_shares(struct cfs_rq cfs_rq, struct task_group tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1066	{
				1067	return tg->shares;
				1068	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1069	# endif /* CONFIG_SMP */
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1070	static void reweight_entity(struct cfs_rq cfs_rq, struct sched_entity se,
				1071	unsigned long weight)
				1072	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1073	if (se->on_rq) {
				1074	/* commit outstanding execution time */
				1075	if (cfs_rq->curr == se)
				1076	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1077	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1078	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1079
				1080	update_load_set(&se->load, weight);
				1081
				1082	if (se->on_rq)
				1083	account_entity_enqueue(cfs_rq, se);
				1084	}
				1085
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1086	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				1087
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1088	static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1089	{
				1090	struct task_group *tg;
				1091	struct sched_entity *se;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1092	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1093
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1094	tg = cfs_rq->tg;
				1095	se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	1096	if (!se \|\| throttled_hierarchy(cfs_rq))
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1097	return;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1098	#ifndef CONFIG_SMP
				1099	if (likely(se->load.weight == tg->shares))
				1100	return;
				1101	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1102	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1103
				1104	reweight_entity(cfs_rq_of(se), se, shares);
				1105	}
				1106	#else /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1107	static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1108	{
				1109	}
				1110	#endif /* CONFIG_FAIR_GROUP_SCHED */
				1111
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	1112	/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
				1113	#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1114	/*
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1115	* We choose a half-life close to 1 scheduling period.
				1116	* Note: The tables below are dependent on this value.
				1117	*/
				1118	#define LOAD_AVG_PERIOD 32
				1119	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
				1120	#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
				1121
				1122	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				1123	static const u32 runnable_avg_yN_inv[] = {
				1124	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				1125	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				1126	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				1127	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				1128	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				1129	0x85aac367, 0x82cd8698,
				1130	};
				1131
				1132	/*
				1133	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				1134	* over-estimates when re-combining.
				1135	*/
				1136	static const u32 runnable_avg_yN_sum[] = {
				1137	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				1138	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				1139	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				1140	};
				1141
				1142	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1143	* Approximate:
				1144	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				1145	*/
				1146	static __always_inline u64 decay_load(u64 val, u64 n)
				1147	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1148	unsigned int local_n;
				1149
				1150	if (!n)
				1151	return val;
				1152	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				1153	return 0;
				1154
				1155	/* after bounds checking we can collapse to 32-bit */
				1156	local_n = n;
				1157
				1158	/*
				1159	* As y^PERIOD = 1/2, we can combine
				1160	* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
				1161	* With a look-up table which covers k^n (n<PERIOD)
				1162	*
				1163	* To achieve constant time decay_load.
				1164	*/
				1165	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				1166	val >>= local_n / LOAD_AVG_PERIOD;
				1167	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1168	}
				1169
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1170	val *= runnable_avg_yN_inv[local_n];
				1171	/* We don't use SRR here since we always want to round down. */
				1172	return val >> 32;
				1173	}
				1174
				1175	/*
				1176	* For updates fully spanning n periods, the contribution to runnable
				1177	* average will be: \Sum 1024*y^n
				1178	*
				1179	* We can compute this reasonably efficiently by combining:
				1180	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				1181	*/
				1182	static u32 __compute_runnable_contrib(u64 n)
				1183	{
				1184	u32 contrib = 0;
				1185
				1186	if (likely(n <= LOAD_AVG_PERIOD))
				1187	return runnable_avg_yN_sum[n];
				1188	else if (unlikely(n >= LOAD_AVG_MAX_N))
				1189	return LOAD_AVG_MAX;
				1190
				1191	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
				1192	do {
				1193	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
				1194	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
				1195
				1196	n -= LOAD_AVG_PERIOD;
				1197	} while (n > LOAD_AVG_PERIOD);
				1198
				1199	contrib = decay_load(contrib, n);
				1200	return contrib + runnable_avg_yN_sum[n];
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1201	}
				1202
				1203	/*
				1204	* We can represent the historical contribution to runnable average as the
				1205	* coefficients of a geometric series. To do this we sub-divide our runnable
				1206	* history into segments of approximately 1ms (1024us); label the segment that
				1207	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				1208	*
				1209	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
				1210	* p0 p1 p2
				1211	* (now) (~1ms ago) (~2ms ago)
				1212	*
				1213	* Let u_i denote the fraction of p_i that the entity was runnable.
				1214	*
				1215	* We then designate the fractions u_i as our co-efficients, yielding the
				1216	* following representation of historical load:
				1217	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
				1218	*
				1219	* We choose y based on the with of a reasonably scheduling period, fixing:
				1220	* y^32 = 0.5
				1221	*
				1222	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				1223	* approximately half as much as the contribution to load within the last ms
				1224	* (u_0).
				1225	*
				1226	* When a period "rolls over" and we have new u_0`, multiplying the previous
				1227	* sum again by y is sufficient to update:
				1228	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
				1229	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
				1230	*/
				1231	static __always_inline int __update_entity_runnable_avg(u64 now,
				1232	struct sched_avg *sa,
				1233	int runnable)
				1234	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1235	u64 delta, periods;
				1236	u32 runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1237	int delta_w, decayed = 0;
				1238
				1239	delta = now - sa->last_runnable_update;
				1240	/*
				1241	* This should only happen when time goes backwards, which it
				1242	* unfortunately does during sched clock init when we swap over to TSC.
				1243	*/
				1244	if ((s64)delta < 0) {
				1245	sa->last_runnable_update = now;
				1246	return 0;
				1247	}
				1248
				1249	/*
				1250	* Use 1024ns as the unit of measurement since it's a reasonable
				1251	* approximation of 1us and fast to compute.
				1252	*/
				1253	delta >>= 10;
				1254	if (!delta)
				1255	return 0;
				1256	sa->last_runnable_update = now;
				1257
				1258	/* delta_w is the amount already accumulated against our next period */
				1259	delta_w = sa->runnable_avg_period % 1024;
				1260	if (delta + delta_w >= 1024) {
				1261	/* period roll-over */
				1262	decayed = 1;
				1263
				1264	/*
				1265	* Now that we know we're crossing a period boundary, figure
				1266	* out how much from delta we need to complete the current
				1267	* period and accrue it.
				1268	*/
				1269	delta_w = 1024 - delta_w;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1270	if (runnable)
				1271	sa->runnable_avg_sum += delta_w;
				1272	sa->runnable_avg_period += delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1273
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1274	delta -= delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1275
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1276	/* Figure out how many additional periods this update spans */
				1277	periods = delta / 1024;
				1278	delta %= 1024;
				1279
				1280	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
				1281	periods + 1);
				1282	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
				1283	periods + 1);
				1284
				1285	/* Efficiently calculate \sum (1..n_period) 1024y^i /
				1286	runnable_contrib = __compute_runnable_contrib(periods);
				1287	if (runnable)
				1288	sa->runnable_avg_sum += runnable_contrib;
				1289	sa->runnable_avg_period += runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1290	}
				1291
				1292	/* Remainder of delta accrued against u_0` */
				1293	if (runnable)
				1294	sa->runnable_avg_sum += delta;
				1295	sa->runnable_avg_period += delta;
				1296
				1297	return decayed;
				1298	}
				1299
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1300	/* Synchronize an entity's decay with its parenting cfs_rq.*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1301	static inline u64 __synchronize_entity_decay(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1302	{
				1303	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1304	u64 decays = atomic64_read(&cfs_rq->decay_counter);
				1305
				1306	decays -= se->avg.decay_count;
				1307	if (!decays)
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1308	return 0;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1309
				1310	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
				1311	se->avg.decay_count = 0;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1312
				1313	return decays;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1314	}
				1315
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1316	#ifdef CONFIG_FAIR_GROUP_SCHED
				1317	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1318	int force_update)
				1319	{
				1320	struct task_group *tg = cfs_rq->tg;
				1321	s64 tg_contrib;
				1322
				1323	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
				1324	tg_contrib -= cfs_rq->tg_load_contrib;
				1325
				1326	if (force_update \|\| abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
				1327	atomic64_add(tg_contrib, &tg->load_avg);
				1328	cfs_rq->tg_load_contrib += tg_contrib;
				1329	}
				1330	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1331
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1332	/*
				1333	* Aggregate cfs_rq runnable averages into an equivalent task_group
				1334	* representation for computing load contributions.
				1335	*/
				1336	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1337	struct cfs_rq *cfs_rq)
				1338	{
				1339	struct task_group *tg = cfs_rq->tg;
				1340	long contrib;
				1341
				1342	/* The fraction of a cpu used by this cfs_rq */
				1343	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
				1344	sa->runnable_avg_period + 1);
				1345	contrib -= cfs_rq->tg_runnable_contrib;
				1346
				1347	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
				1348	atomic_add(contrib, &tg->runnable_avg);
				1349	cfs_rq->tg_runnable_contrib += contrib;
				1350	}
				1351	}
				1352
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1353	static inline void __update_group_entity_contrib(struct sched_entity *se)
				1354	{
				1355	struct cfs_rq *cfs_rq = group_cfs_rq(se);
				1356	struct task_group *tg = cfs_rq->tg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1357	int runnable_avg;
				1358
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1359	u64 contrib;
				1360
				1361	contrib = cfs_rq->tg_load_contrib * tg->shares;
				1362	se->avg.load_avg_contrib = div64_u64(contrib,
				1363	atomic64_read(&tg->load_avg) + 1);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1364
				1365	/*
				1366	* For group entities we need to compute a correction term in the case
				1367	* that they are consuming <1 cpu so that we would contribute the same
				1368	* load as a task of equal weight.
				1369	*
				1370	* Explicitly co-ordinating this measurement would be expensive, but
				1371	* fortunately the sum of each cpus contribution forms a usable
				1372	* lower-bound on the true value.
				1373	*
				1374	* Consider the aggregate of 2 contributions. Either they are disjoint
				1375	* (and the sum represents true value) or they are disjoint and we are
				1376	* understating by the aggregate of their overlap.
				1377	*
				1378	* Extending this to N cpus, for a given overlap, the maximum amount we
				1379	* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
				1380	* cpus that overlap for this interval and w_i is the interval width.
				1381	*
				1382	* On a small machine; the first term is well-bounded which bounds the
				1383	* total error since w_i is a subset of the period. Whereas on a
				1384	* larger machine, while this first term can be larger, if w_i is the
				1385	* of consequential size guaranteed to see n_i*w_i quickly converge to
				1386	* our upper bound of 1-cpu.
				1387	*/
				1388	runnable_avg = atomic_read(&tg->runnable_avg);
				1389	if (runnable_avg < NICE_0_LOAD) {
				1390	se->avg.load_avg_contrib *= runnable_avg;
				1391	se->avg.load_avg_contrib >>= NICE_0_SHIFT;
				1392	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1393	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1394	#else
				1395	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1396	int force_update) {}
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1397	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1398	struct cfs_rq *cfs_rq) {}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1399	static inline void __update_group_entity_contrib(struct sched_entity *se) {}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1400	#endif
				1401
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1402	static inline void __update_task_entity_contrib(struct sched_entity *se)
				1403	{
				1404	u32 contrib;
				1405
				1406	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
				1407	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
				1408	contrib /= (se->avg.runnable_avg_period + 1);
				1409	se->avg.load_avg_contrib = scale_load(contrib);
				1410	}
				1411
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1412	/* Compute the current contribution to load_avg by se, return any delta */
				1413	static long __update_entity_load_avg_contrib(struct sched_entity *se)
				1414	{
				1415	long old_contrib = se->avg.load_avg_contrib;
				1416
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1417	if (entity_is_task(se)) {
				1418	__update_task_entity_contrib(se);
				1419	} else {
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1420	__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1421	__update_group_entity_contrib(se);
				1422	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1423
				1424	return se->avg.load_avg_contrib - old_contrib;
				1425	}
				1426
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1427	static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
				1428	long load_contrib)
				1429	{
				1430	if (likely(load_contrib < cfs_rq->blocked_load_avg))
				1431	cfs_rq->blocked_load_avg -= load_contrib;
				1432	else
				1433	cfs_rq->blocked_load_avg = 0;
				1434	}
				1435
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1436	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
				1437
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1438	/* Update a sched_entity's runnable average */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1439	static inline void update_entity_load_avg(struct sched_entity *se,
				1440	int update_cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1441	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1442	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1443	long contrib_delta;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1444	u64 now;
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1445
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1446	/*
				1447	* For a group entity we need to use their owned cfs_rq_clock_task() in
				1448	* case they are the parent of a throttled hierarchy.
				1449	*/
				1450	if (entity_is_task(se))
				1451	now = cfs_rq_clock_task(cfs_rq);
				1452	else
				1453	now = cfs_rq_clock_task(group_cfs_rq(se));
				1454
				1455	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1456	return;
				1457
				1458	contrib_delta = __update_entity_load_avg_contrib(se);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1459
				1460	if (!update_cfs_rq)
				1461	return;
				1462
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1463	if (se->on_rq)
				1464	cfs_rq->runnable_load_avg += contrib_delta;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1465	else
				1466	subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
				1467	}
				1468
				1469	/*
				1470	* Decay the load contributed by all blocked children and account this so that
				1471	* their contribution may appropriately discounted when they wake up.
				1472	*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1473	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1474	{
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1475	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1476	u64 decays;
				1477
				1478	decays = now - cfs_rq->last_decay;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1479	if (!decays && !force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1480	return;
				1481
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1482	if (atomic64_read(&cfs_rq->removed_load)) {
				1483	u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
				1484	subtract_blocked_load_contrib(cfs_rq, removed_load);
				1485	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1486
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1487	if (decays) {
				1488	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
				1489	decays);
				1490	atomic64_add(decays, &cfs_rq->decay_counter);
				1491	cfs_rq->last_decay = now;
				1492	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1493
				1494	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1495	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1496
				1497	static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
				1498	{
				1499	__update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1500	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1501	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1502
				1503	/* Add the load generated by se into cfs_rq's child load-average */
				1504	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1505	struct sched_entity *se,
				1506	int wakeup)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1507	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1508	/*
				1509	* We track migrations using entity decay_count <= 0, on a wake-up
				1510	* migration we use a negative decay count to track the remote decays
				1511	* accumulated while sleeping.
				1512	*/
				1513	if (unlikely(se->avg.decay_count <= 0)) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1514	se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1515	if (se->avg.decay_count) {
				1516	/*
				1517	* In a wake-up migration we have to approximate the
				1518	* time sleeping. This is because we can't synchronize
				1519	* clock_task between the two cpus, and it is not
				1520	* guaranteed to be read-safe. Instead, we can
				1521	* approximate this using our carried decays, which are
				1522	* explicitly atomically readable.
				1523	*/
				1524	se->avg.last_runnable_update -= (-se->avg.decay_count)
				1525	<< 20;
				1526	update_entity_load_avg(se, 0);
				1527	/* Indicate that we're now synchronized and on-rq */
				1528	se->avg.decay_count = 0;
				1529	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1530	wakeup = 0;
				1531	} else {
				1532	__synchronize_entity_decay(se);
				1533	}
				1534
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1535	/* migrated tasks did not contribute to our blocked load */
				1536	if (wakeup) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1537	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1538	update_entity_load_avg(se, 0);
				1539	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1540
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1541	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1542	/* we force update consideration on load-balancer moves */
				1543	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1544	}
				1545
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1546	/*
				1547	* Remove se's load from this cfs_rq child load-average, if the entity is
				1548	* transitioning to a blocked state we track its projected decay using
				1549	* blocked_load_avg.
				1550	*/
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1551	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1552	struct sched_entity *se,
				1553	int sleep)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1554	{
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1555	update_entity_load_avg(se, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1556	/* we force update consideration on load-balancer moves */
				1557	update_cfs_rq_blocked_load(cfs_rq, !sleep);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1558
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1559	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1560	if (sleep) {
				1561	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
				1562	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				1563	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1564	}
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1565	#else
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1566	static inline void update_entity_load_avg(struct sched_entity *se,
				1567	int update_cfs_rq) {}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1568	static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1569	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1570	struct sched_entity *se,
				1571	int wakeup) {}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1572	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1573	struct sched_entity *se,
				1574	int sleep) {}
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1575	static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				1576	int force_update) {}
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1577	#endif
				1578
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	1579	static void enqueue_sleeper(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1580	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1581	#ifdef CONFIG_SCHEDSTATS
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	1582	struct task_struct *tsk = NULL;
				1583
				1584	if (entity_is_task(se))
				1585	tsk = task_of(se);
				1586
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1587	if (se->statistics.sleep_start) {
				1588	u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1589
				1590	if ((s64)delta < 0)
				1591	delta = 0;
				1592
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1593	if (unlikely(delta > se->statistics.sleep_max))
				1594	se->statistics.sleep_max = delta;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1595
Peter Zijlstra	8c79a04	2012-01-30 14:51:37 +0100	[diff] [blame]	1596	se->statistics.sleep_start = 0;
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1597	se->statistics.sum_sleep_runtime += delta;
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	1598
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	1599	if (tsk) {
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	1600	account_scheduler_latency(tsk, delta >> 10, 1);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	1601	trace_sched_stat_sleep(tsk, delta);
				1602	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1603	}
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1604	if (se->statistics.block_start) {
				1605	u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1606
				1607	if ((s64)delta < 0)
				1608	delta = 0;
				1609
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1610	if (unlikely(delta > se->statistics.block_max))
				1611	se->statistics.block_max = delta;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1612
Peter Zijlstra	8c79a04	2012-01-30 14:51:37 +0100	[diff] [blame]	1613	se->statistics.block_start = 0;
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1614	se->statistics.sum_sleep_runtime += delta;
Ingo Molnar	30084fb	2007-10-02 14:13:08 +0200	[diff] [blame]	1615
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	1616	if (tsk) {
Arjan van de Ven	8f0dfc3	2009-07-20 11:26:58 -0700	[diff] [blame]	1617	if (tsk->in_iowait) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1618	se->statistics.iowait_sum += delta;
				1619	se->statistics.iowait_count++;
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	1620	trace_sched_stat_iowait(tsk, delta);
Arjan van de Ven	8f0dfc3	2009-07-20 11:26:58 -0700	[diff] [blame]	1621	}
				1622
Andrew Vagin	b781a60	2011-11-28 12:03:35 +0300	[diff] [blame]	1623	trace_sched_stat_blocked(tsk, delta);
				1624
Peter Zijlstra	e414314	2009-07-23 20:13:26 +0200	[diff] [blame]	1625	/*
				1626	* Blocking time is in units of nanosecs, so shift by
				1627	* 20 to get a milliseconds-range estimation of the
				1628	* amount of time that the task spent sleeping:
				1629	*/
				1630	if (unlikely(prof_on == SLEEP_PROFILING)) {
				1631	profile_hits(SLEEP_PROFILING,
				1632	(void *)get_wchan(tsk),
				1633	delta >> 20);
				1634	}
				1635	account_scheduler_latency(tsk, delta >> 10, 0);
Ingo Molnar	30084fb	2007-10-02 14:13:08 +0200	[diff] [blame]	1636	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1637	}
				1638	#endif
				1639	}
				1640
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1641	static void check_spread(struct cfs_rq cfs_rq, struct sched_entity se)
				1642	{
				1643	#ifdef CONFIG_SCHED_DEBUG
				1644	s64 d = se->vruntime - cfs_rq->min_vruntime;
				1645
				1646	if (d < 0)
				1647	d = -d;
				1648
				1649	if (d > 3*sysctl_sched_latency)
				1650	schedstat_inc(cfs_rq, nr_spread_over);
				1651	#endif
				1652	}
				1653
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1654	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1655	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				1656	{
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	1657	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1658
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	1659	/*
				1660	* The 'current' period is already promised to the current tasks,
				1661	* however the extra weight of the new task will slow them down a
				1662	* little, place the new task so that it fits in the slot that
				1663	* stays open at the end.
				1664	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1665	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	1666	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1667
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1668	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	1669	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1670	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	1671
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1672	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1673	* Halve their sleep time's effect, to allow
				1674	* for a gentler effect of sleepers:
				1675	*/
				1676	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				1677	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	1678
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1679	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1680	}
				1681
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	1682	/* ensure we never gain time by being placed backwards. */
				1683	vruntime = max_vruntime(se->vruntime, vruntime);
				1684
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1685	se->vruntime = vruntime;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1686	}
				1687
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1688	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				1689
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1690	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1691	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1692	{
				1693	/*
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1694	* Update the normalized vruntime before updating min_vruntime
				1695	* through callig update_curr().
				1696	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1697	if (!(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_WAKING))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1698	se->vruntime += cfs_rq->min_vruntime;
				1699
				1700	/*
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1701	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1702	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	1703	update_curr(cfs_rq);
Paul Turner	f269ae0	2012-10-04 13:18:31 +0200	[diff] [blame]	1704	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1705	account_entity_enqueue(cfs_rq, se);
				1706	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1707
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1708	if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1709	place_entity(cfs_rq, se, 0);
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	1710	enqueue_sleeper(cfs_rq, se);
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	1711	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1712
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	1713	update_stats_enqueue(cfs_rq, se);
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1714	check_spread(cfs_rq, se);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1715	if (se != cfs_rq->curr)
				1716	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1717	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1718
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1719	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1720	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1721	check_enqueue_throttle(cfs_rq);
				1722	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1723	}
				1724
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1725	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1726	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1727	for_each_sched_entity(se) {
				1728	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1729	if (cfs_rq->last == se)
				1730	cfs_rq->last = NULL;
				1731	else
				1732	break;
				1733	}
				1734	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1735
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1736	static void __clear_buddies_next(struct sched_entity *se)
				1737	{
				1738	for_each_sched_entity(se) {
				1739	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1740	if (cfs_rq->next == se)
				1741	cfs_rq->next = NULL;
				1742	else
				1743	break;
				1744	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1745	}
				1746
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1747	static void __clear_buddies_skip(struct sched_entity *se)
				1748	{
				1749	for_each_sched_entity(se) {
				1750	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1751	if (cfs_rq->skip == se)
				1752	cfs_rq->skip = NULL;
				1753	else
				1754	break;
				1755	}
				1756	}
				1757
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1758	static void clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
				1759	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1760	if (cfs_rq->last == se)
				1761	__clear_buddies_last(se);
				1762
				1763	if (cfs_rq->next == se)
				1764	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1765
				1766	if (cfs_rq->skip == se)
				1767	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1768	}
				1769
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	1770	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1771
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1772	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1773	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1774	{
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1775	/*
				1776	* Update run-time statistics of the 'current'.
				1777	*/
				1778	update_curr(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1779	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1780
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	1781	update_stats_dequeue(cfs_rq, se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1782	if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1783	#ifdef CONFIG_SCHEDSTATS
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1784	if (entity_is_task(se)) {
				1785	struct task_struct *tsk = task_of(se);
				1786
				1787	if (tsk->state & TASK_INTERRUPTIBLE)
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1788	se->statistics.sleep_start = rq_of(cfs_rq)->clock;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1789	if (tsk->state & TASK_UNINTERRUPTIBLE)
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1790	se->statistics.block_start = rq_of(cfs_rq)->clock;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1791	}
Dmitry Adamushko	db36cc7	2007-10-15 17:00:06 +0200	[diff] [blame]	1792	#endif
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1793	}
				1794
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1795	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	1796
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1797	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1798	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1799	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1800	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1801
				1802	/*
				1803	* Normalize the entity after updating the min_vruntime because the
				1804	* update can refer to the ->curr item and we need to reflect this
				1805	* movement in our normalized position.
				1806	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1807	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1808	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1809
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1810	/* return excess runtime on last dequeue */
				1811	return_cfs_rq_runtime(cfs_rq);
				1812
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1813	update_min_vruntime(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1814	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1815	}
				1816
				1817	/*
				1818	* Preempt the current task with a newly woken task if needed:
				1819	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	1820	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	1821	check_preempt_tick(struct cfs_rq cfs_rq, struct sched_entity curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1822	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1823	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1824	struct sched_entity *se;
				1825	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1826
Peter Zijlstra	6d0f0ebd	2007-10-15 17:00:05 +0200	[diff] [blame]	1827	ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1828	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1829	if (delta_exec > ideal_runtime) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1830	resched_task(rq_of(cfs_rq)->curr);
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1831	/*
				1832	* The current task ran long enough, ensure it doesn't get
				1833	* re-elected due to buddy favours.
				1834	*/
				1835	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1836	return;
				1837	}
				1838
				1839	/*
				1840	* Ensure that a task that missed wakeup preemption by a
				1841	* narrow margin doesn't have to wait for a full slice.
				1842	* This also mitigates buddy induced latencies under load.
				1843	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1844	if (delta_exec < sysctl_sched_min_granularity)
				1845	return;
				1846
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1847	se = __pick_first_entity(cfs_rq);
				1848	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1849
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1850	if (delta < 0)
				1851	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	1852
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1853	if (delta > ideal_runtime)
				1854	resched_task(rq_of(cfs_rq)->curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1855	}
				1856
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1857	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	1858	set_next_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1859	{
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1860	/* 'current' is not kept within the tree. */
				1861	if (se->on_rq) {
				1862	/*
				1863	* Any task has to be enqueued before it get to execute on
				1864	* a CPU. So account for the time it spent waiting on the
				1865	* runqueue.
				1866	*/
				1867	update_stats_wait_end(cfs_rq, se);
				1868	__dequeue_entity(cfs_rq, se);
				1869	}
				1870
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	1871	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1872	cfs_rq->curr = se;
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	1873	#ifdef CONFIG_SCHEDSTATS
				1874	/*
				1875	* Track our maximum slice length, if the CPU's load is at
				1876	* least twice that of our own weight (i.e. dont track it
				1877	* when there are only lesser-weight tasks around):
				1878	*/
Dmitry Adamushko	495eca4	2007-10-15 17:00:06 +0200	[diff] [blame]	1879	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	1880	se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	1881	se->sum_exec_runtime - se->prev_sum_exec_runtime);
				1882	}
				1883	#endif
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	1884	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1885	}
				1886
Peter Zijlstra	3f3a490	2008-10-24 11:06:16 +0200	[diff] [blame]	1887	static int
				1888	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se);
				1889
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1890	/*
				1891	* Pick the next process, keeping these things in mind, in this order:
				1892	* 1) keep things fair between processes/task groups
				1893	* 2) pick the "next" process, since someone really wants that to run
				1894	* 3) pick the "last" process, for cache locality
				1895	* 4) do not run the "skip" process, if something else is available
				1896	*/
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	1897	static struct sched_entity pick_next_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1898	{
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1899	struct sched_entity *se = __pick_first_entity(cfs_rq);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1900	struct sched_entity *left = se;
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	1901
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1902	/*
				1903	* Avoid running the skip buddy, if running something else can
				1904	* be done without getting too unfair.
				1905	*/
				1906	if (cfs_rq->skip == se) {
				1907	struct sched_entity *second = __pick_next_entity(se);
				1908	if (second && wakeup_preempt_entity(second, left) < 1)
				1909	se = second;
				1910	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1911
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1912	/*
				1913	* Prefer last buddy, try to return the CPU to a preempted task.
				1914	*/
				1915	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				1916	se = cfs_rq->last;
				1917
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1918	/*
				1919	* Someone really wants this to run. If it's not unfair, run it.
				1920	*/
				1921	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				1922	se = cfs_rq->next;
				1923
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1924	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	1925
				1926	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	1927	}
				1928
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1929	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				1930
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	1931	static void put_prev_entity(struct cfs_rq cfs_rq, struct sched_entity prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1932	{
				1933	/*
				1934	* If still on the runqueue then deactivate_task()
				1935	* was not called and update_curr() has to be done:
				1936	*/
				1937	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	1938	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1939
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1940	/* throttle cfs_rqs exceeding runtime */
				1941	check_cfs_rq_runtime(cfs_rq);
				1942
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1943	check_spread(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1944	if (prev->on_rq) {
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	1945	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1946	/* Put 'current' back into the tree. */
				1947	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1948	/* in !on_rq case, update occurred at dequeue */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1949	update_entity_load_avg(prev, 1);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1950	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	1951	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1952	}
				1953
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1954	static void
				1955	entity_tick(struct cfs_rq cfs_rq, struct sched_entity curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1956	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1957	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1958	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1959	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1960	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1961
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	1962	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1963	* Ensure that runnable average is periodically updated.
				1964	*/
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1965	update_entity_load_avg(curr, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1966	update_cfs_rq_blocked_load(cfs_rq, 1);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1967
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1968	#ifdef CONFIG_SCHED_HRTICK
				1969	/*
				1970	* queued ticks are scheduled to match the slice, so don't bother
				1971	* validating it and just reschedule.
				1972	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	1973	if (queued) {
				1974	resched_task(rq_of(cfs_rq)->curr);
				1975	return;
				1976	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	1977	/*
				1978	* don't let the period tick interfere with the hrtick preemption
				1979	*/
				1980	if (!sched_feat(DOUBLE_TICK) &&
				1981	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				1982	return;
				1983	#endif
				1984
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	1985	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	1986	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1987	}
				1988
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	1989
				1990	/**************************************************
				1991	* CFS bandwidth control machinery
				1992	*/
				1993
				1994	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1995
				1996	#ifdef HAVE_JUMP_LABEL
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	1997	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1998
				1999	static inline bool cfs_bandwidth_used(void)
				2000	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2001	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2002	}
				2003
				2004	void account_cfs_bandwidth_used(int enabled, int was_enabled)
				2005	{
				2006	/* only need to count groups transitioning between enabled/!enabled */
				2007	if (enabled && !was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2008	static_key_slow_inc(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2009	else if (!enabled && was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2010	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2011	}
				2012	#else /* HAVE_JUMP_LABEL */
				2013	static bool cfs_bandwidth_used(void)
				2014	{
				2015	return true;
				2016	}
				2017
				2018	void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
				2019	#endif /* HAVE_JUMP_LABEL */
				2020
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2021	/*
				2022	* default period for cfs group bandwidth.
				2023	* default: 0.1s, units: nanoseconds
				2024	*/
				2025	static inline u64 default_cfs_period(void)
				2026	{
				2027	return 100000000ULL;
				2028	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2029
				2030	static inline u64 sched_cfs_bandwidth_slice(void)
				2031	{
				2032	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				2033	}
				2034
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2035	/*
				2036	* Replenish runtime according to assigned quota and update expiration time.
				2037	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				2038	* additional synchronization around rq->lock.
				2039	*
				2040	* requires cfs_b->lock
				2041	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2042	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2043	{
				2044	u64 now;
				2045
				2046	if (cfs_b->quota == RUNTIME_INF)
				2047	return;
				2048
				2049	now = sched_clock_cpu(smp_processor_id());
				2050	cfs_b->runtime = cfs_b->quota;
				2051	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				2052	}
				2053
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2054	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				2055	{
				2056	return &tg->cfs_bandwidth;
				2057	}
				2058
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2059	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				2060	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2061	{
				2062	if (unlikely(cfs_rq->throttle_count))
				2063	return cfs_rq->throttled_clock_task;
				2064
				2065	return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
				2066	}
				2067
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2068	/* returns 0 on failure to allocate runtime */
				2069	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2070	{
				2071	struct task_group *tg = cfs_rq->tg;
				2072	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2073	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2074
				2075	/* note: this is a positive sum as runtime_remaining <= 0 */
				2076	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				2077
				2078	raw_spin_lock(&cfs_b->lock);
				2079	if (cfs_b->quota == RUNTIME_INF)
				2080	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2081	else {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2082	/*
				2083	* If the bandwidth pool has become inactive, then at least one
				2084	* period must have elapsed since the last consumption.
				2085	* Refresh the global state and ensure bandwidth timer becomes
				2086	* active.
				2087	*/
				2088	if (!cfs_b->timer_active) {
				2089	__refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2090	__start_cfs_bandwidth(cfs_b);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2091	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2092
				2093	if (cfs_b->runtime > 0) {
				2094	amount = min(cfs_b->runtime, min_amount);
				2095	cfs_b->runtime -= amount;
				2096	cfs_b->idle = 0;
				2097	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2098	}
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2099	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2100	raw_spin_unlock(&cfs_b->lock);
				2101
				2102	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2103	/*
				2104	* we may have advanced our local expiration to account for allowed
				2105	* spread between our sched_clock and the one on which runtime was
				2106	* issued.
				2107	*/
				2108	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				2109	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2110
				2111	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2112	}
				2113
				2114	/*
				2115	* Note: This depends on the synchronization provided by sched_clock and the
				2116	* fact that rq->clock snapshots this value.
				2117	*/
				2118	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2119	{
				2120	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2121	struct rq *rq = rq_of(cfs_rq);
				2122
				2123	/* if the deadline is ahead of our clock, nothing to do */
				2124	if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
				2125	return;
				2126
				2127	if (cfs_rq->runtime_remaining < 0)
				2128	return;
				2129
				2130	/*
				2131	* If the local deadline has passed we have to consider the
				2132	* possibility that our sched_clock is 'fast' and the global deadline
				2133	* has not truly expired.
				2134	*
				2135	* Fortunately we can check determine whether this the case by checking
				2136	* whether the global deadline has advanced.
				2137	*/
				2138
				2139	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
				2140	/* extend local deadline, drift is bounded above by 2 ticks */
				2141	cfs_rq->runtime_expires += TICK_NSEC;
				2142	} else {
				2143	/* global deadline is ahead, expiration has passed */
				2144	cfs_rq->runtime_remaining = 0;
				2145	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2146	}
				2147
				2148	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2149	unsigned long delta_exec)
				2150	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2151	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2152	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2153	expire_cfs_rq_runtime(cfs_rq);
				2154
				2155	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2156	return;
				2157
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2158	/*
				2159	* if we're unable to extend our runtime we resched so that the active
				2160	* hierarchy can be throttled
				2161	*/
				2162	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
				2163	resched_task(rq_of(cfs_rq)->curr);
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2164	}
				2165
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2166	static __always_inline
				2167	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2168	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2169	if (!cfs_bandwidth_used() \|\| !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2170	return;
				2171
				2172	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				2173	}
				2174
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2175	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2176	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2177	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2178	}
				2179
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2180	/* check whether cfs_rq, or any parent, is throttled */
				2181	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2182	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2183	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2184	}
				2185
				2186	/*
				2187	* Ensure that neither of the group entities corresponding to src_cpu or
				2188	* dest_cpu are members of a throttled hierarchy when performing group
				2189	* load-balance operations.
				2190	*/
				2191	static inline int throttled_lb_pair(struct task_group *tg,
				2192	int src_cpu, int dest_cpu)
				2193	{
				2194	struct cfs_rq src_cfs_rq, dest_cfs_rq;
				2195
				2196	src_cfs_rq = tg->cfs_rq[src_cpu];
				2197	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				2198
				2199	return throttled_hierarchy(src_cfs_rq) \|\|
				2200	throttled_hierarchy(dest_cfs_rq);
				2201	}
				2202
				2203	/* updated child weight may affect parent so we have to do this bottom up */
				2204	static int tg_unthrottle_up(struct task_group tg, void data)
				2205	{
				2206	struct rq *rq = data;
				2207	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2208
				2209	cfs_rq->throttle_count--;
				2210	#ifdef CONFIG_SMP
				2211	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2212	/* adjust cfs_rq_clock_task() */
				2213	cfs_rq->throttled_clock_task_time += rq->clock_task -
				2214	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2215	}
				2216	#endif
				2217
				2218	return 0;
				2219	}
				2220
				2221	static int tg_throttle_down(struct task_group tg, void data)
				2222	{
				2223	struct rq *rq = data;
				2224	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2225
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2226	/* group is entering throttled state, stop time */
				2227	if (!cfs_rq->throttle_count)
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2228	cfs_rq->throttled_clock_task = rq->clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2229	cfs_rq->throttle_count++;
				2230
				2231	return 0;
				2232	}
				2233
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2234	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2235	{
				2236	struct rq *rq = rq_of(cfs_rq);
				2237	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2238	struct sched_entity *se;
				2239	long task_delta, dequeue = 1;
				2240
				2241	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2242
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2243	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2244	rcu_read_lock();
				2245	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				2246	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2247
				2248	task_delta = cfs_rq->h_nr_running;
				2249	for_each_sched_entity(se) {
				2250	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				2251	/* throttled entity or throttle-on-deactivate */
				2252	if (!se->on_rq)
				2253	break;
				2254
				2255	if (dequeue)
				2256	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				2257	qcfs_rq->h_nr_running -= task_delta;
				2258
				2259	if (qcfs_rq->load.weight)
				2260	dequeue = 0;
				2261	}
				2262
				2263	if (!se)
				2264	rq->nr_running -= task_delta;
				2265
				2266	cfs_rq->throttled = 1;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2267	cfs_rq->throttled_clock = rq->clock;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2268	raw_spin_lock(&cfs_b->lock);
				2269	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
				2270	raw_spin_unlock(&cfs_b->lock);
				2271	}
				2272
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2273	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2274	{
				2275	struct rq *rq = rq_of(cfs_rq);
				2276	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2277	struct sched_entity *se;
				2278	int enqueue = 1;
				2279	long task_delta;
				2280
				2281	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2282
				2283	cfs_rq->throttled = 0;
				2284	raw_spin_lock(&cfs_b->lock);
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2285	cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2286	list_del_rcu(&cfs_rq->throttled_list);
				2287	raw_spin_unlock(&cfs_b->lock);
				2288
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2289	update_rq_clock(rq);
				2290	/* update hierarchical throttle state */
				2291	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				2292
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2293	if (!cfs_rq->load.weight)
				2294	return;
				2295
				2296	task_delta = cfs_rq->h_nr_running;
				2297	for_each_sched_entity(se) {
				2298	if (se->on_rq)
				2299	enqueue = 0;
				2300
				2301	cfs_rq = cfs_rq_of(se);
				2302	if (enqueue)
				2303	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				2304	cfs_rq->h_nr_running += task_delta;
				2305
				2306	if (cfs_rq_throttled(cfs_rq))
				2307	break;
				2308	}
				2309
				2310	if (!se)
				2311	rq->nr_running += task_delta;
				2312
				2313	/* determine whether we need to wake up potentially idle cpu */
				2314	if (rq->curr == rq->idle && rq->cfs.nr_running)
				2315	resched_task(rq->curr);
				2316	}
				2317
				2318	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				2319	u64 remaining, u64 expires)
				2320	{
				2321	struct cfs_rq *cfs_rq;
				2322	u64 runtime = remaining;
				2323
				2324	rcu_read_lock();
				2325	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				2326	throttled_list) {
				2327	struct rq *rq = rq_of(cfs_rq);
				2328
				2329	raw_spin_lock(&rq->lock);
				2330	if (!cfs_rq_throttled(cfs_rq))
				2331	goto next;
				2332
				2333	runtime = -cfs_rq->runtime_remaining + 1;
				2334	if (runtime > remaining)
				2335	runtime = remaining;
				2336	remaining -= runtime;
				2337
				2338	cfs_rq->runtime_remaining += runtime;
				2339	cfs_rq->runtime_expires = expires;
				2340
				2341	/* we check whether we're throttled above */
				2342	if (cfs_rq->runtime_remaining > 0)
				2343	unthrottle_cfs_rq(cfs_rq);
				2344
				2345	next:
				2346	raw_spin_unlock(&rq->lock);
				2347
				2348	if (!remaining)
				2349	break;
				2350	}
				2351	rcu_read_unlock();
				2352
				2353	return remaining;
				2354	}
				2355
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2356	/*
				2357	* Responsible for refilling a task_group's bandwidth and unthrottling its
				2358	* cfs_rqs as appropriate. If there has been no activity within the last
				2359	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				2360	* used to track this state.
				2361	*/
				2362	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				2363	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2364	u64 runtime, runtime_expires;
				2365	int idle = 1, throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2366
				2367	raw_spin_lock(&cfs_b->lock);
				2368	/* no need to continue the timer with no bandwidth constraint */
				2369	if (cfs_b->quota == RUNTIME_INF)
				2370	goto out_unlock;
				2371
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2372	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2373	/* idle depends on !throttled (for the case of a large deficit) */
				2374	idle = cfs_b->idle && !throttled;
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2375	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2376
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2377	/* if we're going inactive then everything else can be deferred */
				2378	if (idle)
				2379	goto out_unlock;
				2380
				2381	__refill_cfs_bandwidth_runtime(cfs_b);
				2382
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2383	if (!throttled) {
				2384	/* mark as potentially idle for the upcoming period */
				2385	cfs_b->idle = 1;
				2386	goto out_unlock;
				2387	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2388
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2389	/* account preceding periods in which throttling occurred */
				2390	cfs_b->nr_throttled += overrun;
				2391
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2392	/*
				2393	* There are throttled entities so we must first use the new bandwidth
				2394	* to unthrottle them before making it generally available. This
				2395	* ensures that all existing debts will be paid before a new cfs_rq is
				2396	* allowed to run.
				2397	*/
				2398	runtime = cfs_b->runtime;
				2399	runtime_expires = cfs_b->runtime_expires;
				2400	cfs_b->runtime = 0;
				2401
				2402	/*
				2403	* This check is repeated as we are holding onto the new bandwidth
				2404	* while we unthrottle. This can potentially race with an unthrottled
				2405	* group trying to acquire new bandwidth from the global pool.
				2406	*/
				2407	while (throttled && runtime > 0) {
				2408	raw_spin_unlock(&cfs_b->lock);
				2409	/* we can't nest cfs_b->lock while distributing bandwidth */
				2410	runtime = distribute_cfs_runtime(cfs_b, runtime,
				2411	runtime_expires);
				2412	raw_spin_lock(&cfs_b->lock);
				2413
				2414	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2415	}
				2416
				2417	/* return (any) remaining runtime */
				2418	cfs_b->runtime = runtime;
				2419	/*
				2420	* While we are ensured activity in the period following an
				2421	* unthrottle, this also covers the case in which the new bandwidth is
				2422	* insufficient to cover the existing bandwidth deficit. (Forcing the
				2423	* timer to remain active while there are any throttled entities.)
				2424	*/
				2425	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2426	out_unlock:
				2427	if (idle)
				2428	cfs_b->timer_active = 0;
				2429	raw_spin_unlock(&cfs_b->lock);
				2430
				2431	return idle;
				2432	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2433
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2434	/* a cfs_rq won't donate quota below this amount */
				2435	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				2436	/* minimum remaining period time to redistribute slack quota */
				2437	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				2438	/* how long we wait to gather additional slack before distributing */
				2439	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				2440
				2441	/* are we near the end of the current quota period? */
				2442	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				2443	{
				2444	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				2445	u64 remaining;
				2446
				2447	/* if the call-back is running a quota refresh is already occurring */
				2448	if (hrtimer_callback_running(refresh_timer))
				2449	return 1;
				2450
				2451	/* is a quota refresh about to occur? */
				2452	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				2453	if (remaining < min_expire)
				2454	return 1;
				2455
				2456	return 0;
				2457	}
				2458
				2459	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				2460	{
				2461	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				2462
				2463	/* if there's a quota refresh soon don't bother with slack */
				2464	if (runtime_refresh_within(cfs_b, min_left))
				2465	return;
				2466
				2467	start_bandwidth_timer(&cfs_b->slack_timer,
				2468	ns_to_ktime(cfs_bandwidth_slack_period));
				2469	}
				2470
				2471	/* we know any runtime found here is valid as update_curr() precedes return */
				2472	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2473	{
				2474	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2475	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				2476
				2477	if (slack_runtime <= 0)
				2478	return;
				2479
				2480	raw_spin_lock(&cfs_b->lock);
				2481	if (cfs_b->quota != RUNTIME_INF &&
				2482	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				2483	cfs_b->runtime += slack_runtime;
				2484
				2485	/* we are under rq->lock, defer unthrottling using a timer */
				2486	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				2487	!list_empty(&cfs_b->throttled_cfs_rq))
				2488	start_cfs_slack_bandwidth(cfs_b);
				2489	}
				2490	raw_spin_unlock(&cfs_b->lock);
				2491
				2492	/* even if it's not valid for return we don't want to try again */
				2493	cfs_rq->runtime_remaining -= slack_runtime;
				2494	}
				2495
				2496	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2497	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2498	if (!cfs_bandwidth_used())
				2499	return;
				2500
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	2501	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2502	return;
				2503
				2504	__return_cfs_rq_runtime(cfs_rq);
				2505	}
				2506
				2507	/*
				2508	* This is done with a timer (instead of inline with bandwidth return) since
				2509	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				2510	*/
				2511	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				2512	{
				2513	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				2514	u64 expires;
				2515
				2516	/* confirm we're still not at a refresh boundary */
				2517	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
				2518	return;
				2519
				2520	raw_spin_lock(&cfs_b->lock);
				2521	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
				2522	runtime = cfs_b->runtime;
				2523	cfs_b->runtime = 0;
				2524	}
				2525	expires = cfs_b->runtime_expires;
				2526	raw_spin_unlock(&cfs_b->lock);
				2527
				2528	if (!runtime)
				2529	return;
				2530
				2531	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				2532
				2533	raw_spin_lock(&cfs_b->lock);
				2534	if (expires == cfs_b->runtime_expires)
				2535	cfs_b->runtime = runtime;
				2536	raw_spin_unlock(&cfs_b->lock);
				2537	}
				2538
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2539	/*
				2540	* When a group wakes up we want to make sure that its quota is not already
				2541	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				2542	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				2543	*/
				2544	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				2545	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2546	if (!cfs_bandwidth_used())
				2547	return;
				2548
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2549	/* an active group must be handled by the update_curr()->put() path */
				2550	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				2551	return;
				2552
				2553	/* ensure the group is not already throttled */
				2554	if (cfs_rq_throttled(cfs_rq))
				2555	return;
				2556
				2557	/* update runtime allocation */
				2558	account_cfs_rq_runtime(cfs_rq, 0);
				2559	if (cfs_rq->runtime_remaining <= 0)
				2560	throttle_cfs_rq(cfs_rq);
				2561	}
				2562
				2563	/* conditionally throttle active cfs_rq's from put_prev_entity() */
				2564	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2565	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2566	if (!cfs_bandwidth_used())
				2567	return;
				2568
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2569	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
				2570	return;
				2571
				2572	/*
				2573	* it's possible for a throttled entity to be forced into a running
				2574	* state (e.g. set_curr_task), in this case we're finished.
				2575	*/
				2576	if (cfs_rq_throttled(cfs_rq))
				2577	return;
				2578
				2579	throttle_cfs_rq(cfs_rq);
				2580	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2581
				2582	static inline u64 default_cfs_period(void);
				2583	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
				2584	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
				2585
				2586	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				2587	{
				2588	struct cfs_bandwidth *cfs_b =
				2589	container_of(timer, struct cfs_bandwidth, slack_timer);
				2590	do_sched_cfs_slack_timer(cfs_b);
				2591
				2592	return HRTIMER_NORESTART;
				2593	}
				2594
				2595	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				2596	{
				2597	struct cfs_bandwidth *cfs_b =
				2598	container_of(timer, struct cfs_bandwidth, period_timer);
				2599	ktime_t now;
				2600	int overrun;
				2601	int idle = 0;
				2602
				2603	for (;;) {
				2604	now = hrtimer_cb_get_time(timer);
				2605	overrun = hrtimer_forward(timer, now, cfs_b->period);
				2606
				2607	if (!overrun)
				2608	break;
				2609
				2610	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				2611	}
				2612
				2613	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				2614	}
				2615
				2616	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2617	{
				2618	raw_spin_lock_init(&cfs_b->lock);
				2619	cfs_b->runtime = 0;
				2620	cfs_b->quota = RUNTIME_INF;
				2621	cfs_b->period = ns_to_ktime(default_cfs_period());
				2622
				2623	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
				2624	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2625	cfs_b->period_timer.function = sched_cfs_period_timer;
				2626	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2627	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				2628	}
				2629
				2630	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2631	{
				2632	cfs_rq->runtime_enabled = 0;
				2633	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				2634	}
				2635
				2636	/* requires cfs_b->lock, may release to reprogram timer */
				2637	void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2638	{
				2639	/*
				2640	* The timer may be active because we're trying to set a new bandwidth
				2641	* period or because we're racing with the tear-down path
				2642	* (timer_active==0 becomes visible before the hrtimer call-back
				2643	* terminates). In either case we ensure that it's re-programmed
				2644	*/
				2645	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
				2646	raw_spin_unlock(&cfs_b->lock);
				2647	/* ensure cfs_b->lock is available while we wait */
				2648	hrtimer_cancel(&cfs_b->period_timer);
				2649
				2650	raw_spin_lock(&cfs_b->lock);
				2651	/* if someone else restarted the timer then we're done */
				2652	if (cfs_b->timer_active)
				2653	return;
				2654	}
				2655
				2656	cfs_b->timer_active = 1;
				2657	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
				2658	}
				2659
				2660	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2661	{
				2662	hrtimer_cancel(&cfs_b->period_timer);
				2663	hrtimer_cancel(&cfs_b->slack_timer);
				2664	}
				2665
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	2666	static void unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2667	{
				2668	struct cfs_rq *cfs_rq;
				2669
				2670	for_each_leaf_cfs_rq(rq, cfs_rq) {
				2671	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2672
				2673	if (!cfs_rq->runtime_enabled)
				2674	continue;
				2675
				2676	/*
				2677	* clock_task is not advancing so we just need to make sure
				2678	* there's some valid quota amount
				2679	*/
				2680	cfs_rq->runtime_remaining = cfs_b->quota;
				2681	if (cfs_rq_throttled(cfs_rq))
				2682	unthrottle_cfs_rq(cfs_rq);
				2683	}
				2684	}
				2685
				2686	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2687	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2688	{
				2689	return rq_of(cfs_rq)->clock_task;
				2690	}
				2691
				2692	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2693	unsigned long delta_exec) {}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2694	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				2695	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2696	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2697
				2698	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2699	{
				2700	return 0;
				2701	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2702
				2703	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2704	{
				2705	return 0;
				2706	}
				2707
				2708	static inline int throttled_lb_pair(struct task_group *tg,
				2709	int src_cpu, int dest_cpu)
				2710	{
				2711	return 0;
				2712	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2713
				2714	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				2715
				2716	#ifdef CONFIG_FAIR_GROUP_SCHED
				2717	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2718	#endif
				2719
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2720	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				2721	{
				2722	return NULL;
				2723	}
				2724	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	2725	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2726
				2727	#endif /* CONFIG_CFS_BANDWIDTH */
				2728
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2729	/**************************************************
				2730	* CFS operations on tasks:
				2731	*/
				2732
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2733	#ifdef CONFIG_SCHED_HRTICK
				2734	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				2735	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2736	struct sched_entity *se = &p->se;
				2737	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2738
				2739	WARN_ON(task_rq(p) != rq);
				2740
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2741	if (cfs_rq->nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2742	u64 slice = sched_slice(cfs_rq, se);
				2743	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				2744	s64 delta = slice - ran;
				2745
				2746	if (delta < 0) {
				2747	if (rq->curr == p)
				2748	resched_task(p);
				2749	return;
				2750	}
				2751
				2752	/*
				2753	* Don't schedule slices shorter than 10000ns, that just
				2754	* doesn't make sense. Rely on vruntime for fairness.
				2755	*/
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2756	if (rq->curr != p)
Peter Zijlstra	157124c	2008-07-28 11:53:11 +0200	[diff] [blame]	2757	delta = max_t(s64, 10000LL, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2758
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2759	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2760	}
				2761	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2762
				2763	/*
				2764	* called from enqueue/dequeue and updates the hrtick when the
				2765	* current task is from our class and nr_running is low enough
				2766	* to matter.
				2767	*/
				2768	static void hrtick_update(struct rq *rq)
				2769	{
				2770	struct task_struct *curr = rq->curr;
				2771
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2772	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2773	return;
				2774
				2775	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				2776	hrtick_start_fair(rq, curr);
				2777	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	2778	#else /* !CONFIG_SCHED_HRTICK */
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2779	static inline void
				2780	hrtick_start_fair(struct rq rq, struct task_struct p)
				2781	{
				2782	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2783
				2784	static inline void hrtick_update(struct rq *rq)
				2785	{
				2786	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2787	#endif
				2788
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2789	/*
				2790	* The enqueue_task method is called before nr_running is
				2791	* increased. Here we update the fair scheduling stats and
				2792	* then put the task into the rbtree:
				2793	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	2794	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2795	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2796	{
				2797	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2798	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2799
				2800	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2801	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2802	break;
				2803	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2804	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2805
				2806	/*
				2807	* end evaluation on encountering a throttled cfs_rq
				2808	*
				2809	* note: in the case of encountering a throttled cfs_rq we will
				2810	* post the final h_nr_running increment below.
				2811	*/
				2812	if (cfs_rq_throttled(cfs_rq))
				2813	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2814	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2815
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2816	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2817	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2818
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2819	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	2820	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2821	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2822
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2823	if (cfs_rq_throttled(cfs_rq))
				2824	break;
				2825
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2826	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2827	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2828	}
				2829
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2830	if (!se) {
				2831	update_rq_runnable_avg(rq, rq->nr_running);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2832	inc_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2833	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2834	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2835	}
				2836
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2837	static void set_next_buddy(struct sched_entity *se);
				2838
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2839	/*
				2840	* The dequeue_task method is called before nr_running is
				2841	* decreased. We remove the task from the rbtree and
				2842	* update the fair scheduling stats:
				2843	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2844	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2845	{
				2846	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2847	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2848	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2849
				2850	for_each_sched_entity(se) {
				2851	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2852	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2853
				2854	/*
				2855	* end evaluation on encountering a throttled cfs_rq
				2856	*
				2857	* note: in the case of encountering a throttled cfs_rq we will
				2858	* post the final h_nr_running decrement below.
				2859	*/
				2860	if (cfs_rq_throttled(cfs_rq))
				2861	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2862	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2863
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2864	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2865	if (cfs_rq->load.weight) {
				2866	/*
				2867	* Bias pick_next to pick a task from this cfs_rq, as
				2868	* p is sleeping when it is within its sched_slice.
				2869	*/
				2870	if (task_sleep && parent_entity(se))
				2871	set_next_buddy(parent_entity(se));
Paul Turner	9598c82	2011-07-06 22:30:37 -0700	[diff] [blame]	2872
				2873	/* avoid re-evaluating load for this entity */
				2874	se = parent_entity(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2875	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2876	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2877	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2878	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2879
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2880	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	2881	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2882	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2883
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2884	if (cfs_rq_throttled(cfs_rq))
				2885	break;
				2886
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2887	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2888	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2889	}
				2890
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2891	if (!se) {
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2892	dec_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2893	update_rq_runnable_avg(rq, 1);
				2894	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2895	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2896	}
				2897
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	2898	#ifdef CONFIG_SMP
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2899	/* Used instead of source_load when we know the type == 0 */
				2900	static unsigned long weighted_cpuload(const int cpu)
				2901	{
				2902	return cpu_rq(cpu)->load.weight;
				2903	}
				2904
				2905	/*
				2906	* Return a low guess at the load of a migration-source cpu weighted
				2907	* according to the scheduling class and "nice" value.
				2908	*
				2909	* We want to under-estimate the load of migration sources, to
				2910	* balance conservatively.
				2911	*/
				2912	static unsigned long source_load(int cpu, int type)
				2913	{
				2914	struct rq *rq = cpu_rq(cpu);
				2915	unsigned long total = weighted_cpuload(cpu);
				2916
				2917	if (type == 0 \|\| !sched_feat(LB_BIAS))
				2918	return total;
				2919
				2920	return min(rq->cpu_load[type-1], total);
				2921	}
				2922
				2923	/*
				2924	* Return a high guess at the load of a migration-target cpu weighted
				2925	* according to the scheduling class and "nice" value.
				2926	*/
				2927	static unsigned long target_load(int cpu, int type)
				2928	{
				2929	struct rq *rq = cpu_rq(cpu);
				2930	unsigned long total = weighted_cpuload(cpu);
				2931
				2932	if (type == 0 \|\| !sched_feat(LB_BIAS))
				2933	return total;
				2934
				2935	return max(rq->cpu_load[type-1], total);
				2936	}
				2937
				2938	static unsigned long power_of(int cpu)
				2939	{
				2940	return cpu_rq(cpu)->cpu_power;
				2941	}
				2942
				2943	static unsigned long cpu_avg_load_per_task(int cpu)
				2944	{
				2945	struct rq *rq = cpu_rq(cpu);
				2946	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
				2947
				2948	if (nr_running)
				2949	return rq->load.weight / nr_running;
				2950
				2951	return 0;
				2952	}
				2953
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	2954
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	2955	static void task_waking_fair(struct task_struct *p)
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2956	{
				2957	struct sched_entity *se = &p->se;
				2958	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2959	u64 min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2960
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2961	#ifndef CONFIG_64BIT
				2962	u64 min_vruntime_copy;
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	2963
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	2964	do {
				2965	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				2966	smp_rmb();
				2967	min_vruntime = cfs_rq->min_vruntime;
				2968	} while (min_vruntime != min_vruntime_copy);
				2969	#else
				2970	min_vruntime = cfs_rq->min_vruntime;
				2971	#endif
				2972
				2973	se->vruntime -= min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2974	}
				2975
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	2976	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	2977	/*
				2978	* effective_load() calculates the load change as seen from the root_task_group
				2979	*
				2980	* Adding load to a group doesn't make a group heavier, but can cause movement
				2981	* of group shares between cpus. Assuming the shares were perfectly aligned one
				2982	* can calculate the shift in shares.
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	2983	*
				2984	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				2985	* on this @cpu and results in a total addition (subtraction) of @wg to the
				2986	* total group weight.
				2987	*
				2988	* Given a runqueue weight distribution (rw_i) we can compute a shares
				2989	* distribution (s_i) using:
				2990	*
				2991	* s_i = rw_i / \Sum rw_j (1)
				2992	*
				2993	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				2994	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				2995	* shares distribution (s_i):
				2996	*
				2997	* rw_i = { 2, 4, 1, 0 }
				2998	* s_i = { 2/7, 4/7, 1/7, 0 }
				2999	*
				3000	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				3001	* task used to run on and the CPU the waker is running on), we need to
				3002	* compute the effect of waking a task on either CPU and, in case of a sync
				3003	* wakeup, compute the effect of the current task going to sleep.
				3004	*
				3005	* So for a change of @wl to the local @cpu with an overall group weight change
				3006	* of @wl we can compute the new shares distribution (s'_i) using:
				3007	*
				3008	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				3009	*
				3010	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				3011	* differences in waking a task to CPU 0. The additional task changes the
				3012	* weight and shares distributions like:
				3013	*
				3014	* rw'_i = { 3, 4, 1, 0 }
				3015	* s'_i = { 3/8, 4/8, 1/8, 0 }
				3016	*
				3017	* We can then compute the difference in effective weight by using:
				3018	*
				3019	* dw_i = S * (s'_i - s_i) (3)
				3020	*
				3021	* Where 'S' is the group weight as seen by its parent.
				3022	*
				3023	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				3024	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				3025	* 4/7) times the weight of the group.
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3026	*/
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3027	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3028	{
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3029	struct sched_entity *se = tg->se[cpu];
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3030
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3031	if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3032	return wl;
				3033
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3034	for_each_sched_entity(se) {
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3035	long w, W;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3036
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3037	tg = se->my_q->tg;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3038
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3039	/*
				3040	* W = @wg + \Sum rw_j
				3041	*/
				3042	W = wg + calc_tg_weight(tg, se->my_q);
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3043
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3044	/*
				3045	* w = rw_i + @wl
				3046	*/
				3047	w = se->my_q->load.weight + wl;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3048
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3049	/*
				3050	* wl = S * s'_i; see (2)
				3051	*/
				3052	if (W > 0 && w < W)
				3053	wl = (w * tg->shares) / W;
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3054	else
				3055	wl = tg->shares;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3056
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3057	/*
				3058	* Per the above, wl is the new se->load.weight value; since
				3059	* those are clipped to [MIN_SHARES, ...) do so now. See
				3060	* calc_cfs_shares().
				3061	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3062	if (wl < MIN_SHARES)
				3063	wl = MIN_SHARES;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3064
				3065	/*
				3066	* wl = dw_i = S * (s'_i - s_i); see (3)
				3067	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3068	wl -= se->load.weight;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3069
				3070	/*
				3071	* Recursively apply this logic to all parent groups to compute
				3072	* the final effective load change on the root group. Since
				3073	* only the @tg group gets extra weight, all parent groups can
				3074	* only redistribute existing shares. @wl is the shift in shares
				3075	* resulting from this level per the above.
				3076	*/
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3077	wg = 0;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3078	}
				3079
				3080	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3081	}
				3082	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3083
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3084	static inline unsigned long effective_load(struct task_group *tg, int cpu,
				3085	unsigned long wl, unsigned long wg)
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3086	{
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3087	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3088	}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3089
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3090	#endif
				3091
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3092	static int wake_affine(struct sched_domain sd, struct task_struct p, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3093	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3094	s64 this_load, load;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3095	int idx, this_cpu, prev_cpu;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3096	unsigned long tl_per_task;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3097	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3098	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3099	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3100
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3101	idx = sd->wake_idx;
				3102	this_cpu = smp_processor_id();
				3103	prev_cpu = task_cpu(p);
				3104	load = source_load(prev_cpu, idx);
				3105	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3106
				3107	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3108	* If sync wakeup then subtract the (maximum possible)
				3109	* effect of the currently running task from the load
				3110	* of the current CPU:
				3111	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3112	if (sync) {
				3113	tg = task_group(current);
				3114	weight = current->se.load.weight;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3115
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3116	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3117	load += effective_load(tg, prev_cpu, 0, -weight);
				3118	}
				3119
				3120	tg = task_group(p);
				3121	weight = p->se.load.weight;
				3122
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3123	/*
				3124	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3125	* due to the sync cause above having dropped this_load to 0, we'll
				3126	* always have an imbalance, but there's really nothing you can do
				3127	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3128	*
				3129	* Otherwise check if either cpus are near enough in load to allow this
				3130	* task to be woken on this_cpu.
				3131	*/
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3132	if (this_load > 0) {
				3133	s64 this_eff_load, prev_eff_load;
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	3134
				3135	this_eff_load = 100;
				3136	this_eff_load *= power_of(prev_cpu);
				3137	this_eff_load *= this_load +
				3138	effective_load(tg, this_cpu, weight, weight);
				3139
				3140	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				3141	prev_eff_load *= power_of(this_cpu);
				3142	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
				3143
				3144	balanced = this_eff_load <= prev_eff_load;
				3145	} else
				3146	balanced = true;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3147
				3148	/*
				3149	* If the currently running task will sleep within
				3150	* a reasonable amount of time then attract this newly
				3151	* woken task:
				3152	*/
Peter Zijlstra	2fb7635	2008-10-08 09:16:04 +0200	[diff] [blame]	3153	if (sync && balanced)
				3154	return 1;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3155
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3156	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3157	tl_per_task = cpu_avg_load_per_task(this_cpu);
				3158
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3159	if (balanced \|\|
				3160	(this_load <= load &&
				3161	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3162	/*
				3163	* This domain has SD_WAKE_AFFINE and
				3164	* p is cache cold in this domain, and
				3165	* there is no bad imbalance.
				3166	*/
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3167	schedstat_inc(sd, ttwu_move_affine);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3168	schedstat_inc(p, se.statistics.nr_wakeups_affine);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3169
				3170	return 1;
				3171	}
				3172	return 0;
				3173	}
				3174
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3175	/*
				3176	* find_idlest_group finds and returns the least busy CPU group within the
				3177	* domain.
				3178	*/
				3179	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	3180	find_idlest_group(struct sched_domain sd, struct task_struct p,
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3181	int this_cpu, int load_idx)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3182	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	3183	struct sched_group idlest = NULL, group = sd->groups;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3184	unsigned long min_load = ULONG_MAX, this_load = 0;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3185	int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3186
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3187	do {
				3188	unsigned long load, avg_load;
				3189	int local_group;
				3190	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3191
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3192	/* Skip over this group if it has no CPUs allowed */
				3193	if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3194	tsk_cpus_allowed(p)))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3195	continue;
				3196
				3197	local_group = cpumask_test_cpu(this_cpu,
				3198	sched_group_cpus(group));
				3199
				3200	/* Tally up the load of all CPUs in the group */
				3201	avg_load = 0;
				3202
				3203	for_each_cpu(i, sched_group_cpus(group)) {
				3204	/* Bias balancing toward cpus of our domain */
				3205	if (local_group)
				3206	load = source_load(i, load_idx);
				3207	else
				3208	load = target_load(i, load_idx);
				3209
				3210	avg_load += load;
				3211	}
				3212
				3213	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	3214	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3215
				3216	if (local_group) {
				3217	this_load = avg_load;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3218	} else if (avg_load < min_load) {
				3219	min_load = avg_load;
				3220	idlest = group;
				3221	}
				3222	} while (group = group->next, group != sd->groups);
				3223
				3224	if (!idlest \|\| 100this_load < imbalancemin_load)
				3225	return NULL;
				3226	return idlest;
				3227	}
				3228
				3229	/*
				3230	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				3231	*/
				3232	static int
				3233	find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				3234	{
				3235	unsigned long load, min_load = ULONG_MAX;
				3236	int idlest = -1;
				3237	int i;
				3238
				3239	/* Traverse only the allowed CPUs */
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3240	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3241	load = weighted_cpuload(i);
				3242
				3243	if (load < min_load \|\| (load == min_load && i == this_cpu)) {
				3244	min_load = load;
				3245	idlest = i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3246	}
				3247	}
				3248
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3249	return idlest;
				3250	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3251
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3252	/*
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3253	* Try and locate an idle CPU in the sched_domain.
				3254	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3255	static int select_idle_sibling(struct task_struct *p, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3256	{
				3257	int cpu = smp_processor_id();
				3258	int prev_cpu = task_cpu(p);
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3259	struct sched_domain *sd;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3260	struct sched_group *sg;
				3261	int i;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3262
				3263	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3264	* If the task is going to be woken-up on this cpu and if it is
				3265	* already idle, then it is the right target.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3266	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3267	if (target == cpu && idle_cpu(cpu))
				3268	return cpu;
				3269
				3270	/*
				3271	* If the task is going to be woken-up on the cpu where it previously
				3272	* ran and if it is currently idle, then it the right target.
				3273	*/
				3274	if (target == prev_cpu && idle_cpu(prev_cpu))
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	3275	return prev_cpu;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3276
				3277	/*
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3278	* Otherwise, iterate the domains and find an elegible idle cpu.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3279	*/
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	3280	sd = rcu_dereference(per_cpu(sd_llc, target));
Suresh Siddha	77e8136	2011-11-17 11:08:23 -0800	[diff] [blame]	3281	for_each_lower_domain(sd) {
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3282	sg = sd->groups;
				3283	do {
				3284	if (!cpumask_intersects(sched_group_cpus(sg),
				3285	tsk_cpus_allowed(p)))
				3286	goto next;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	3287
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3288	for_each_cpu(i, sched_group_cpus(sg)) {
				3289	if (!idle_cpu(i))
				3290	goto next;
				3291	}
				3292
				3293	target = cpumask_first_and(sched_group_cpus(sg),
				3294	tsk_cpus_allowed(p));
				3295	goto done;
				3296	next:
				3297	sg = sg->next;
				3298	} while (sg != sd->groups);
				3299	}
				3300	done:
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3301	return target;
				3302	}
				3303
				3304	/*
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3305	* sched_balance_self: balance the current task (running on cpu) in domains
				3306	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
				3307	* SD_BALANCE_EXEC.
				3308	*
				3309	* Balance, ie. select the least loaded group.
				3310	*
				3311	* Returns the target CPU number, or the same CPU if no balancing is needed.
				3312	*
				3313	* preempt must be disabled.
				3314	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	3315	static int
Peter Zijlstra	7608dec	2011-04-05 17:23:46 +0200	[diff] [blame]	3316	select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3317	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3318	struct sched_domain tmp, affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3319	int cpu = smp_processor_id();
				3320	int prev_cpu = task_cpu(p);
				3321	int new_cpu = cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3322	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3323	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3324
Peter Zijlstra	29baa74	2012-04-23 12:11:21 +0200	[diff] [blame]	3325	if (p->nr_cpus_allowed == 1)
Mike Galbraith	76854c7	2011-11-22 15:18:24 +0100	[diff] [blame]	3326	return prev_cpu;
				3327
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3328	if (sd_flag & SD_BALANCE_WAKE) {
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3329	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3330	want_affine = 1;
				3331	new_cpu = prev_cpu;
				3332	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3333
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3334	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3335	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f42888	2009-12-16 18:04:34 +0100	[diff] [blame]	3336	if (!(tmp->flags & SD_LOAD_BALANCE))
				3337	continue;
				3338
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3339	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3340	* If both cpu and prev_cpu are part of this domain,
				3341	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	3342	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3343	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				3344	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				3345	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3346	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3347	}
				3348
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3349	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3350	sd = tmp;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3351	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3352
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3353	if (affine_sd) {
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3354	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3355	prev_cpu = cpu;
				3356
				3357	new_cpu = select_idle_sibling(p, prev_cpu);
				3358	goto unlock;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3359	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	3360
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3361	while (sd) {
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3362	int load_idx = sd->forkexec_idx;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3363	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3364	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3365
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3366	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3367	sd = sd->child;
				3368	continue;
				3369	}
				3370
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3371	if (sd_flag & SD_BALANCE_WAKE)
				3372	load_idx = sd->wake_idx;
				3373
				3374	group = find_idlest_group(sd, p, cpu, load_idx);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3375	if (!group) {
				3376	sd = sd->child;
				3377	continue;
				3378	}
				3379
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	3380	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3381	if (new_cpu == -1 \|\| new_cpu == cpu) {
				3382	/* Now try balancing at a lower domain level of cpu */
				3383	sd = sd->child;
				3384	continue;
				3385	}
				3386
				3387	/* Now try balancing at a lower domain level of new_cpu */
				3388	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3389	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3390	sd = NULL;
				3391	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3392	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3393	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3394	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3395	sd = tmp;
				3396	}
				3397	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3398	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3399	unlock:
				3400	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3401
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3402	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3403	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3404
				3405	/*
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	3406	* Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
				3407	* removed when useful for applications beyond shares distribution (e.g.
				3408	* load-balance).
				3409	*/
				3410	#ifdef CONFIG_FAIR_GROUP_SCHED
				3411	/*
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3412	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				3413	* cfs_rq_of(p) references at time of call are still valid and identify the
				3414	* previous cpu. However, the caller only guarantees p->pi_lock is held; no
				3415	* other assumptions, including the state of rq->lock, should be made.
				3416	*/
				3417	static void
				3418	migrate_task_rq_fair(struct task_struct *p, int next_cpu)
				3419	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	3420	struct sched_entity *se = &p->se;
				3421	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3422
				3423	/*
				3424	* Load tracking: accumulate removed load so that it can be processed
				3425	* when we next update owning cfs_rq under rq->lock. Tasks contribute
				3426	* to blocked load iff they have a positive decay-count. It can never
				3427	* be negative here since on-rq tasks have decay-count == 0.
				3428	*/
				3429	if (se->avg.decay_count) {
				3430	se->avg.decay_count = -__synchronize_entity_decay(se);
				3431	atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
				3432	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3433	}
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	3434	#endif
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3435	#endif /* CONFIG_SMP */
				3436
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3437	static unsigned long
				3438	wakeup_gran(struct sched_entity curr, struct sched_entity se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3439	{
				3440	unsigned long gran = sysctl_sched_wakeup_granularity;
				3441
				3442	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3443	* Since its curr running now, convert the gran from real-time
				3444	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	3445	*
				3446	* By using 'se' instead of 'curr' we penalize light tasks, so
				3447	* they get preempted easier. That is, if 'se' < 'curr' then
				3448	* the resulting gran will be larger, therefore penalizing the
				3449	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				3450	* be smaller, again penalizing the lighter task.
				3451	*
				3452	* This is especially important for buddies when the leftmost
				3453	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3454	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	3455	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3456	}
				3457
				3458	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3459	* Should 'se' preempt 'curr'.
				3460	*
				3461	* \|s1
				3462	* \|s2
				3463	* \|s3
				3464	* g
				3465	* \|<--->\|c
				3466	*
				3467	* w(c, s1) = -1
				3468	* w(c, s2) = 0
				3469	* w(c, s3) = 1
				3470	*
				3471	*/
				3472	static int
				3473	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				3474	{
				3475	s64 gran, vdiff = curr->vruntime - se->vruntime;
				3476
				3477	if (vdiff <= 0)
				3478	return -1;
				3479
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3480	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3481	if (vdiff > gran)
				3482	return 1;
				3483
				3484	return 0;
				3485	}
				3486
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3487	static void set_last_buddy(struct sched_entity *se)
				3488	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3489	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3490	return;
				3491
				3492	for_each_sched_entity(se)
				3493	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3494	}
				3495
				3496	static void set_next_buddy(struct sched_entity *se)
				3497	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3498	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3499	return;
				3500
				3501	for_each_sched_entity(se)
				3502	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3503	}
				3504
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3505	static void set_skip_buddy(struct sched_entity *se)
				3506	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3507	for_each_sched_entity(se)
				3508	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3509	}
				3510
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3511	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3512	* Preempt the current task with a newly woken task if needed:
				3513	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	3514	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3515	{
				3516	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	3517	struct sched_entity se = &curr->se, pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3518	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3519	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3520	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3521
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	3522	if (unlikely(se == pse))
				3523	return;
				3524
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3525	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3526	* This is possible from callers such as move_task(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3527	* unconditionally check_prempt_curr() after an enqueue (which may have
				3528	* lead to a throttle). This both saves work and prevents false
				3529	* next-buddy nomination below.
				3530	*/
				3531	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				3532	return;
				3533
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3534	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	3535	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3536	next_buddy_marked = 1;
				3537	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	3538
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3539	/*
				3540	* We can come here with TIF_NEED_RESCHED already set from new task
				3541	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3542	*
				3543	* Note: this also catches the edge-case of curr being in a throttled
				3544	* group (e.g. via set_curr_task), since update_curr() (in the
				3545	* enqueue of curr) will have resulted in resched being set. This
				3546	* prevents us from potentially nominating it as a false LAST_BUDDY
				3547	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3548	*/
				3549	if (test_tsk_need_resched(curr))
				3550	return;
				3551
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3552	/* Idle tasks are by definition preempted by non-idle tasks. */
				3553	if (unlikely(curr->policy == SCHED_IDLE) &&
				3554	likely(p->policy != SCHED_IDLE))
				3555	goto preempt;
				3556
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3557	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3558	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				3559	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3560	*/
Ingo Molnar	8ed92e51	2012-10-14 14:28:50 +0200	[diff] [blame]	3561	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3562	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3563
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3564	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	3565	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3566	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3567	if (wakeup_preempt_entity(se, pse) == 1) {
				3568	/*
				3569	* Bias pick_next to pick the sched entity that is
				3570	* triggering this preemption.
				3571	*/
				3572	if (!next_buddy_marked)
				3573	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3574	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3575	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	3576
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3577	return;
				3578
				3579	preempt:
				3580	resched_task(curr);
				3581	/*
				3582	* Only set the backward buddy when the current task is still
				3583	* on the rq. This can happen when a wakeup gets interleaved
				3584	* with schedule on the ->pre_schedule() or idle_balance()
				3585	* point, either of which can * drop the rq lock.
				3586	*
				3587	* Also, during early boot the idle thread is in the fair class,
				3588	* for obvious reasons its a bad idea to schedule back to it.
				3589	*/
				3590	if (unlikely(!se->on_rq \|\| curr == rq->idle))
				3591	return;
				3592
				3593	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				3594	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3595	}
				3596
Ingo Molnar	fb8d472	2007-08-09 11:16:48 +0200	[diff] [blame]	3597	static struct task_struct pick_next_task_fair(struct rq rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3598	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3599	struct task_struct *p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3600	struct cfs_rq *cfs_rq = &rq->cfs;
				3601	struct sched_entity *se;
				3602
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	3603	if (!cfs_rq->nr_running)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3604	return NULL;
				3605
				3606	do {
Ingo Molnar	9948f4b	2007-08-09 11:16:48 +0200	[diff] [blame]	3607	se = pick_next_entity(cfs_rq);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	3608	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3609	cfs_rq = group_cfs_rq(se);
				3610	} while (cfs_rq);
				3611
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3612	p = task_of(se);
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	3613	if (hrtick_enabled(rq))
				3614	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3615
				3616	return p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3617	}
				3618
				3619	/*
				3620	* Account for a descheduled task:
				3621	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	3622	static void put_prev_task_fair(struct rq rq, struct task_struct prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3623	{
				3624	struct sched_entity *se = &prev->se;
				3625	struct cfs_rq *cfs_rq;
				3626
				3627	for_each_sched_entity(se) {
				3628	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	3629	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3630	}
				3631	}
				3632
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3633	/*
				3634	* sched_yield() is very simple
				3635	*
				3636	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				3637	*/
				3638	static void yield_task_fair(struct rq *rq)
				3639	{
				3640	struct task_struct *curr = rq->curr;
				3641	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				3642	struct sched_entity *se = &curr->se;
				3643
				3644	/*
				3645	* Are we the only task in the tree?
				3646	*/
				3647	if (unlikely(rq->nr_running == 1))
				3648	return;
				3649
				3650	clear_buddies(cfs_rq, se);
				3651
				3652	if (curr->policy != SCHED_BATCH) {
				3653	update_rq_clock(rq);
				3654	/*
				3655	* Update run-time statistics of the 'current'.
				3656	*/
				3657	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	3658	/*
				3659	* Tell update_rq_clock() that we've just updated,
				3660	* so we don't do microscopic update in schedule()
				3661	* and double the fastpath cost.
				3662	*/
				3663	rq->skip_clock_update = 1;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3664	}
				3665
				3666	set_skip_buddy(se);
				3667	}
				3668
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3669	static bool yield_to_task_fair(struct rq rq, struct task_struct p, bool preempt)
				3670	{
				3671	struct sched_entity *se = &p->se;
				3672
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3673	/* throttled hierarchies are not runnable */
				3674	if (!se->on_rq \|\| throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3675	return false;
				3676
				3677	/* Tell the scheduler that we'd really like pse to run next. */
				3678	set_next_buddy(se);
				3679
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3680	yield_task_fair(rq);
				3681
				3682	return true;
				3683	}
				3684
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	3685	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3686	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	3687	* Fair scheduling class load-balancing methods.
				3688	*
				3689	* BASICS
				3690	*
				3691	* The purpose of load-balancing is to achieve the same basic fairness the
				3692	* per-cpu scheduler provides, namely provide a proportional amount of compute
				3693	* time to each task. This is expressed in the following equation:
				3694	*
				3695	* W_i,n/P_i == W_j,n/P_j for all i,j (1)
				3696	*
				3697	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				3698	* W_i,0 is defined as:
				3699	*
				3700	* W_i,0 = \Sum_j w_i,j (2)
				3701	*
				3702	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
				3703	* is derived from the nice value as per prio_to_weight[].
				3704	*
				3705	* The weight average is an exponential decay average of the instantaneous
				3706	* weight:
				3707	*
				3708	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				3709	*
				3710	* P_i is the cpu power (or compute capacity) of cpu i, typically it is the
				3711	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				3712	* can also include other factors [XXX].
				3713	*
				3714	* To achieve this balance we define a measure of imbalance which follows
				3715	* directly from (1):
				3716	*
				3717	* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
				3718	*
				3719	* We them move tasks around to minimize the imbalance. In the continuous
				3720	* function space it is obvious this converges, in the discrete case we get
				3721	* a few fun cases generally called infeasible weight scenarios.
				3722	*
				3723	* [XXX expand on:
				3724	* - infeasible weights;
				3725	* - local vs global optima in the discrete case. ]
				3726	*
				3727	*
				3728	* SCHED DOMAINS
				3729	*
				3730	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				3731	* for all i,j solution, we create a tree of cpus that follows the hardware
				3732	* topology where each level pairs two lower groups (or better). This results
				3733	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				3734	* tree to only the first of the previous level and we decrease the frequency
				3735	* of load-balance at each level inv. proportional to the number of cpus in
				3736	* the groups.
				3737	*
				3738	* This yields:
				3739	*
				3740	* log_2 n 1 n
				3741	* \Sum { --- * --- * 2^i } = O(n) (5)
				3742	* i = 0 2^i 2^i
				3743	* `- size of each group
				3744	* \| \| `- number of cpus doing load-balance
				3745	* \| `- freq
				3746	* `- sum over all levels
				3747	*
				3748	* Coupled with a limit on how many tasks we can migrate every balance pass,
				3749	* this makes (5) the runtime complexity of the balancer.
				3750	*
				3751	* An important property here is that each CPU is still (indirectly) connected
				3752	* to every other cpu in at most O(log n) steps:
				3753	*
				3754	* The adjacency matrix of the resulting graph is given by:
				3755	*
				3756	* log_2 n
				3757	* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
				3758	* k = 0
				3759	*
				3760	* And you'll find that:
				3761	*
				3762	* A^(log_2 n)_i,j != 0 for all i,j (7)
				3763	*
				3764	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				3765	* The task movement gives a factor of O(m), giving a convergence complexity
				3766	* of:
				3767	*
				3768	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				3769	*
				3770	*
				3771	* WORK CONSERVING
				3772	*
				3773	* In order to avoid CPUs going idle while there's still work to do, new idle
				3774	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				3775	* tree itself instead of relying on other CPUs to bring it work.
				3776	*
				3777	* This adds some complexity to both (5) and (8) but it reduces the total idle
				3778	* time.
				3779	*
				3780	* [XXX more?]
				3781	*
				3782	*
				3783	* CGROUPS
				3784	*
				3785	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				3786	*
				3787	* s_k,i
				3788	* W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
				3789	* S_k
				3790	*
				3791	* Where
				3792	*
				3793	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				3794	*
				3795	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				3796	*
				3797	* The big problem is S_k, its a global sum needed to compute a local (W_i)
				3798	* property.
				3799	*
				3800	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				3801	* rewrite all of this once again.]
				3802	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3803
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	3804	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				3805
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3806	#define LBF_ALL_PINNED 0x01
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3807	#define LBF_NEED_BREAK 0x02
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3808	#define LBF_SOME_PINNED 0x04
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3809
				3810	struct lb_env {
				3811	struct sched_domain *sd;
				3812
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3813	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	3814	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3815
				3816	int dst_cpu;
				3817	struct rq *dst_rq;
				3818
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3819	struct cpumask *dst_grpmask;
				3820	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3821	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	3822	long imbalance;
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	3823	/* The set of CPUs under consideration for load-balancing */
				3824	struct cpumask *cpus;
				3825
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3826	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3827
				3828	unsigned int loop;
				3829	unsigned int loop_break;
				3830	unsigned int loop_max;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3831	};
				3832
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3833	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3834	* move_task - move a task from one runqueue to another runqueue.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3835	* Both runqueues must be locked.
				3836	*/
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3837	static void move_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3838	{
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3839	deactivate_task(env->src_rq, p, 0);
				3840	set_task_cpu(p, env->dst_cpu);
				3841	activate_task(env->dst_rq, p, 0);
				3842	check_preempt_curr(env->dst_rq, p, 0);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3843	}
				3844
				3845	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3846	* Is this task likely cache-hot:
				3847	*/
				3848	static int
				3849	task_hot(struct task_struct p, u64 now, struct sched_domain sd)
				3850	{
				3851	s64 delta;
				3852
				3853	if (p->sched_class != &fair_sched_class)
				3854	return 0;
				3855
				3856	if (unlikely(p->policy == SCHED_IDLE))
				3857	return 0;
				3858
				3859	/*
				3860	* Buddy candidates are cache hot:
				3861	*/
				3862	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
				3863	(&p->se == cfs_rq_of(&p->se)->next \|\|
				3864	&p->se == cfs_rq_of(&p->se)->last))
				3865	return 1;
				3866
				3867	if (sysctl_sched_migration_cost == -1)
				3868	return 1;
				3869	if (sysctl_sched_migration_cost == 0)
				3870	return 0;
				3871
				3872	delta = now - p->se.exec_start;
				3873
				3874	return delta < (s64)sysctl_sched_migration_cost;
				3875	}
				3876
				3877	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3878	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				3879	*/
				3880	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3881	int can_migrate_task(struct task_struct p, struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3882	{
				3883	int tsk_cache_hot = 0;
				3884	/*
				3885	* We do not migrate tasks that are:
				3886	* 1) running (obviously), or
				3887	* 2) cannot be migrated to this CPU due to cpus_allowed, or
				3888	* 3) are cache-hot on their current CPU.
				3889	*/
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3890	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3891	int new_dst_cpu;
				3892
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3893	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3894
				3895	/*
				3896	* Remember if this task can be migrated to any other cpu in
				3897	* our sched_group. We may want to revisit it if we couldn't
				3898	* meet load balance goals by pulling other tasks on src_cpu.
				3899	*
				3900	* Also avoid computing new_dst_cpu if we have already computed
				3901	* one in current iteration.
				3902	*/
				3903	if (!env->dst_grpmask \|\| (env->flags & LBF_SOME_PINNED))
				3904	return 0;
				3905
				3906	new_dst_cpu = cpumask_first_and(env->dst_grpmask,
				3907	tsk_cpus_allowed(p));
				3908	if (new_dst_cpu < nr_cpu_ids) {
				3909	env->flags \|= LBF_SOME_PINNED;
				3910	env->new_dst_cpu = new_dst_cpu;
				3911	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3912	return 0;
				3913	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	3914
				3915	/* Record that we found atleast one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3916	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3917
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3918	if (task_running(env->src_rq, p)) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3919	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3920	return 0;
				3921	}
				3922
				3923	/*
				3924	* Aggressive migration if:
				3925	* 1) task is cache cold, or
				3926	* 2) too many balance attempts have failed.
				3927	*/
				3928
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3929	tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3930	if (!tsk_cache_hot \|\|
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3931	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3932	#ifdef CONFIG_SCHEDSTATS
				3933	if (tsk_cache_hot) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3934	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3935	schedstat_inc(p, se.statistics.nr_forced_migrations);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3936	}
				3937	#endif
				3938	return 1;
				3939	}
				3940
				3941	if (tsk_cache_hot) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3942	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3943	return 0;
				3944	}
				3945	return 1;
				3946	}
				3947
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3948	/*
				3949	* move_one_task tries to move exactly one task from busiest to this_rq, as
				3950	* part of active balancing operations within "domain".
				3951	* Returns 1 if successful and 0 otherwise.
				3952	*
				3953	* Called with both runqueues locked.
				3954	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	3955	static int move_one_task(struct lb_env *env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3956	{
				3957	struct task_struct p, n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3958
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3959	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
				3960	if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
				3961	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3962
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3963	if (!can_migrate_task(p, env))
				3964	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3965
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3966	move_task(p, env);
				3967	/*
				3968	* Right now, this is only the second place move_task()
				3969	* is called, so we can safely collect move_task()
				3970	* stats here rather than inside move_task().
				3971	*/
				3972	schedstat_inc(env->sd, lb_gained[env->idle]);
				3973	return 1;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3974	}
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	3975	return 0;
				3976	}
				3977
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3978	static unsigned long task_h_load(struct task_struct *p);
				3979
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	3980	static const unsigned int sched_nr_migrate_break = 32;
				3981
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3982	/*
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	3983	* move_tasks tries to move up to imbalance weighted load from busiest to
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3984	* this_rq, as part of a balancing operation within domain "sd".
				3985	* Returns 1 if successful and 0 otherwise.
				3986	*
				3987	* Called with both runqueues locked.
				3988	*/
				3989	static int move_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3990	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3991	struct list_head *tasks = &env->src_rq->cfs_tasks;
				3992	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3993	unsigned long load;
				3994	int pulled = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3995
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	3996	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3997	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	3998
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	3999	while (!list_empty(tasks)) {
				4000	p = list_first_entry(tasks, struct task_struct, se.group_node);
				4001
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4002	env->loop++;
				4003	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4004	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4005	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4006
				4007	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4008	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4009	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4010	env->flags \|= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4011	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	4012	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4013
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4014	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4015	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4016
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4017	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4018
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4019	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4020	goto next;
				4021
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4022	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4023	goto next;
				4024
				4025	if (!can_migrate_task(p, env))
				4026	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4027
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4028	move_task(p, env);
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4029	pulled++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4030	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4031
				4032	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4033	/*
				4034	* NEWIDLE balancing is a source of latency, so preemptible
				4035	* kernels will stop after the first task is pulled to minimize
				4036	* the critical section.
				4037	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4038	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4039	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4040	#endif
				4041
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4042	/*
				4043	* We only want to steal up to the prescribed amount of
				4044	* weighted load.
				4045	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4046	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4047	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4048
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4049	continue;
				4050	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4051	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4052	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4053
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4054	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4055	* Right now, this is one of only two places move_task() is called,
				4056	* so we can safely collect move_task() stats here rather than
				4057	* inside move_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4058	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4059	schedstat_add(env->sd, lb_gained[env->idle], pulled);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4060
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4061	return pulled;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4062	}
				4063
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4064	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4065	/*
				4066	* update tg->load_weight by folding this cpu's load_avg
				4067	*/
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4068	static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4069	{
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4070	struct sched_entity *se = tg->se[cpu];
				4071	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4072
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4073	/* throttled entities do not contribute to load */
				4074	if (throttled_hierarchy(cfs_rq))
				4075	return;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4076
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	4077	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4078
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4079	if (se) {
				4080	update_entity_load_avg(se, 1);
				4081	/*
				4082	* We pivot on our runnable average having decayed to zero for
				4083	* list removal. This generally implies that all our children
				4084	* have also been removed (modulo rounding error or bandwidth
				4085	* control); however, such cases are rare and we can fix these
				4086	* at enqueue.
				4087	*
				4088	* TODO: fix up out-of-order children on enqueue.
				4089	*/
				4090	if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
				4091	list_del_leaf_cfs_rq(cfs_rq);
				4092	} else {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4093	struct rq *rq = rq_of(cfs_rq);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4094	update_rq_runnable_avg(rq, rq->nr_running);
				4095	}
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4096	}
				4097
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4098	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4099	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4100	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4101	struct cfs_rq *cfs_rq;
				4102	unsigned long flags;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4103
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4104	raw_spin_lock_irqsave(&rq->lock, flags);
				4105	update_rq_clock(rq);
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4106	/*
				4107	* Iterates the task_group tree in a bottom up fashion, see
				4108	* list_add_leaf_cfs_rq() for details.
				4109	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4110	for_each_leaf_cfs_rq(rq, cfs_rq) {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4111	/*
				4112	* Note: We may want to consider periodically releasing
				4113	* rq->lock about these updates so that creating many task
				4114	* groups does not result in continually extending hold time.
				4115	*/
				4116	__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4117	}
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4118
				4119	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4120	}
				4121
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4122	/*
				4123	* Compute the cpu's hierarchical load factor for each task group.
				4124	* This needs to be done in a top-down fashion because the load of a child
				4125	* group is a fraction of its parents load.
				4126	*/
				4127	static int tg_load_down(struct task_group tg, void data)
				4128	{
				4129	unsigned long load;
				4130	long cpu = (long)data;
				4131
				4132	if (!tg->parent) {
				4133	load = cpu_rq(cpu)->load.weight;
				4134	} else {
				4135	load = tg->parent->cfs_rq[cpu]->h_load;
				4136	load *= tg->se[cpu]->load.weight;
				4137	load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
				4138	}
				4139
				4140	tg->cfs_rq[cpu]->h_load = load;
				4141
				4142	return 0;
				4143	}
				4144
				4145	static void update_h_load(long cpu)
				4146	{
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4147	struct rq *rq = cpu_rq(cpu);
				4148	unsigned long now = jiffies;
				4149
				4150	if (rq->h_load_throttle == now)
				4151	return;
				4152
				4153	rq->h_load_throttle = now;
				4154
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4155	rcu_read_lock();
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4156	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4157	rcu_read_unlock();
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4158	}
				4159
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4160	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4161	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4162	struct cfs_rq *cfs_rq = task_cfs_rq(p);
				4163	unsigned long load;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4164
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4165	load = p->se.load.weight;
				4166	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4167
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4168	return load;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4169	}
				4170	#else
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4171	static inline void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4172	{
				4173	}
				4174
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4175	static inline void update_h_load(long cpu)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4176	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4177	}
				4178
				4179	static unsigned long task_h_load(struct task_struct *p)
				4180	{
				4181	return p->se.load.weight;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4182	}
				4183	#endif
				4184
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4185	/******** Helpers for find_busiest_group **********************/
				4186	/*
				4187	* sd_lb_stats - Structure to store the statistics of a sched_domain
				4188	* during load balancing.
				4189	*/
				4190	struct sd_lb_stats {
				4191	struct sched_group busiest; / Busiest group in this sd */
				4192	struct sched_group this; / Local group in this sd */
				4193	unsigned long total_load; /* Total load of all groups in sd */
				4194	unsigned long total_pwr; /* Total power of all groups in sd */
				4195	unsigned long avg_load; /* Average load across all groups in sd */
				4196
				4197	/** Statistics of this group */
				4198	unsigned long this_load;
				4199	unsigned long this_load_per_task;
				4200	unsigned long this_nr_running;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4201	unsigned long this_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4202	unsigned int this_idle_cpus;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4203
				4204	/* Statistics of the busiest group */
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4205	unsigned int busiest_idle_cpus;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4206	unsigned long max_load;
				4207	unsigned long busiest_load_per_task;
				4208	unsigned long busiest_nr_running;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4209	unsigned long busiest_group_capacity;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4210	unsigned long busiest_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4211	unsigned int busiest_group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4212
				4213	int group_imb; /* Is there imbalance in this sd */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4214	};
				4215
				4216	/*
				4217	* sg_lb_stats - stats of a sched_group required for load_balancing
				4218	*/
				4219	struct sg_lb_stats {
				4220	unsigned long avg_load; /Avg load across the CPUs of the group /
				4221	unsigned long group_load; /* Total load over the CPUs of the group */
				4222	unsigned long sum_nr_running; /* Nr tasks running in the group */
				4223	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
				4224	unsigned long group_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4225	unsigned long idle_cpus;
				4226	unsigned long group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4227	int group_imb; /* Is there an imbalance in the group ? */
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4228	int group_has_capacity; /* Is there extra capacity in the group? */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4229	};
				4230
				4231	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4232	* get_sd_load_idx - Obtain the load index for a given sched domain.
				4233	* @sd: The sched_domain whose load_idx is to be obtained.
				4234	* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
				4235	*/
				4236	static inline int get_sd_load_idx(struct sched_domain *sd,
				4237	enum cpu_idle_type idle)
				4238	{
				4239	int load_idx;
				4240
				4241	switch (idle) {
				4242	case CPU_NOT_IDLE:
				4243	load_idx = sd->busy_idx;
				4244	break;
				4245
				4246	case CPU_NEWLY_IDLE:
				4247	load_idx = sd->newidle_idx;
				4248	break;
				4249	default:
				4250	load_idx = sd->idle_idx;
				4251	break;
				4252	}
				4253
				4254	return load_idx;
				4255	}
				4256
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4257	unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
				4258	{
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4259	return SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4260	}
				4261
				4262	unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
				4263	{
				4264	return default_scale_freq_power(sd, cpu);
				4265	}
				4266
				4267	unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
				4268	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4269	unsigned long weight = sd->span_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4270	unsigned long smt_gain = sd->smt_gain;
				4271
				4272	smt_gain /= weight;
				4273
				4274	return smt_gain;
				4275	}
				4276
				4277	unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
				4278	{
				4279	return default_scale_smt_power(sd, cpu);
				4280	}
				4281
				4282	unsigned long scale_rt_power(int cpu)
				4283	{
				4284	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4285	u64 total, available, age_stamp, avg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4286
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4287	/*
				4288	* Since we're reading these variables without serialization make sure
				4289	* we read them once before doing sanity checks on them.
				4290	*/
				4291	age_stamp = ACCESS_ONCE(rq->age_stamp);
				4292	avg = ACCESS_ONCE(rq->rt_avg);
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4293
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4294	total = sched_avg_period() + (rq->clock - age_stamp);
				4295
				4296	if (unlikely(total < avg)) {
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4297	/* Ensures that power won't end up being negative */
				4298	available = 0;
				4299	} else {
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4300	available = total - avg;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4301	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4302
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4303	if (unlikely((s64)total < SCHED_POWER_SCALE))
				4304	total = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4305
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4306	total >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4307
				4308	return div_u64(available, total);
				4309	}
				4310
				4311	static void update_cpu_power(struct sched_domain *sd, int cpu)
				4312	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4313	unsigned long weight = sd->span_weight;
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4314	unsigned long power = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4315	struct sched_group *sdg = sd->groups;
				4316
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4317	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
				4318	if (sched_feat(ARCH_POWER))
				4319	power *= arch_scale_smt_power(sd, cpu);
				4320	else
				4321	power *= default_scale_smt_power(sd, cpu);
				4322
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4323	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4324	}
				4325
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4326	sdg->sgp->power_orig = power;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4327
				4328	if (sched_feat(ARCH_POWER))
				4329	power *= arch_scale_freq_power(sd, cpu);
				4330	else
				4331	power *= default_scale_freq_power(sd, cpu);
				4332
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4333	power >>= SCHED_POWER_SHIFT;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4334
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4335	power *= scale_rt_power(cpu);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4336	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4337
				4338	if (!power)
				4339	power = 1;
				4340
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4341	cpu_rq(cpu)->cpu_power = power;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4342	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4343	}
				4344
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4345	void update_group_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4346	{
				4347	struct sched_domain *child = sd->child;
				4348	struct sched_group group, sdg = sd->groups;
				4349	unsigned long power;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4350	unsigned long interval;
				4351
				4352	interval = msecs_to_jiffies(sd->balance_interval);
				4353	interval = clamp(interval, 1UL, max_load_balance_interval);
				4354	sdg->sgp->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4355
				4356	if (!child) {
				4357	update_cpu_power(sd, cpu);
				4358	return;
				4359	}
				4360
				4361	power = 0;
				4362
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4363	if (child->flags & SD_OVERLAP) {
				4364	/*
				4365	* SD_OVERLAP domains cannot assume that child groups
				4366	* span the current group.
				4367	*/
				4368
				4369	for_each_cpu(cpu, sched_group_cpus(sdg))
				4370	power += power_of(cpu);
				4371	} else {
				4372	/*
				4373	* !SD_OVERLAP domains can assume that child groups
				4374	* span the current group.
				4375	*/
				4376
				4377	group = child->groups;
				4378	do {
				4379	power += group->sgp->power;
				4380	group = group->next;
				4381	} while (group != child->groups);
				4382	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4383
Peter Zijlstra	c3decf0	2012-05-31 12:05:32 +0200	[diff] [blame]	4384	sdg->sgp->power_orig = sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4385	}
				4386
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4387	/*
				4388	* Try and fix up capacity for tiny siblings, this is needed when
				4389	* things like SD_ASYM_PACKING need f_b_g to select another sibling
				4390	* which on its own isn't powerful enough.
				4391	*
				4392	* See update_sd_pick_busiest() and check_asym_packing().
				4393	*/
				4394	static inline int
				4395	fix_small_capacity(struct sched_domain sd, struct sched_group group)
				4396	{
				4397	/*
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4398	* Only siblings can have significantly less than SCHED_POWER_SCALE
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4399	*/
Peter Zijlstra	a6c75f2	2011-04-07 14:09:52 +0200	[diff] [blame]	4400	if (!(sd->flags & SD_SHARE_CPUPOWER))
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4401	return 0;
				4402
				4403	/*
				4404	* If ~90% of the cpu_power is still there, we're good.
				4405	*/
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4406	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4407	return 1;
				4408
				4409	return 0;
				4410	}
				4411
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4412	/**
				4413	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4414	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4415	* @group: sched_group whose statistics are to be updated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4416	* @load_idx: Load index of sched_domain of this_cpu for load calc.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4417	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4418	* @balance: Should we balance.
				4419	* @sgs: variable to hold the statistics for this group.
				4420	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4421	static inline void update_sg_lb_stats(struct lb_env *env,
				4422	struct sched_group *group, int load_idx,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4423	int local_group, int balance, struct sg_lb_stats sgs)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4424	{
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4425	unsigned long nr_running, max_nr_running, min_nr_running;
				4426	unsigned long load, max_cpu_load, min_cpu_load;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4427	unsigned int balance_cpu = -1, first_idle_cpu = 0;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4428	unsigned long avg_load_per_task = 0;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4429	int i;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4430
Gautham R Shenoy	871e35b	2010-01-20 14:02:44 -0600	[diff] [blame]	4431	if (local_group)
Peter Zijlstra	c117487	2012-05-31 14:47:33 +0200	[diff] [blame]	4432	balance_cpu = group_balance_cpu(group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4433
				4434	/* Tally up the load of all CPUs in the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4435	max_cpu_load = 0;
				4436	min_cpu_load = ~0UL;
Nikhil Rao	2582f0e	2010-10-13 12:09:36 -0700	[diff] [blame]	4437	max_nr_running = 0;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4438	min_nr_running = ~0UL;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4439
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4440	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4441	struct rq *rq = cpu_rq(i);
				4442
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4443	nr_running = rq->nr_running;
				4444
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4445	/* Bias balancing toward cpus of our domain */
				4446	if (local_group) {
Peter Zijlstra	c117487	2012-05-31 14:47:33 +0200	[diff] [blame]	4447	if (idle_cpu(i) && !first_idle_cpu &&
				4448	cpumask_test_cpu(i, sched_group_mask(group))) {
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4449	first_idle_cpu = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4450	balance_cpu = i;
				4451	}
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4452
				4453	load = target_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4454	} else {
				4455	load = source_load(i, load_idx);
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4456	if (load > max_cpu_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4457	max_cpu_load = load;
				4458	if (min_cpu_load > load)
				4459	min_cpu_load = load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4460
				4461	if (nr_running > max_nr_running)
				4462	max_nr_running = nr_running;
				4463	if (min_nr_running > nr_running)
				4464	min_nr_running = nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4465	}
				4466
				4467	sgs->group_load += load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4468	sgs->sum_nr_running += nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4469	sgs->sum_weighted_load += weighted_cpuload(i);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4470	if (idle_cpu(i))
				4471	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4472	}
				4473
				4474	/*
				4475	* First idle cpu or the first cpu(busiest) in this sched group
				4476	* is eligible for doing load balancing at this and above
				4477	* domains. In the newly idle case, we will allow all the cpu's
				4478	* to do the newly idle load balance.
				4479	*/
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4480	if (local_group) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4481	if (env->idle != CPU_NEWLY_IDLE) {
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4482	if (balance_cpu != env->dst_cpu) {
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4483	*balance = 0;
				4484	return;
				4485	}
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4486	update_group_power(env->sd, env->dst_cpu);
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4487	} else if (time_after_eq(jiffies, group->sgp->next_update))
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4488	update_group_power(env->sd, env->dst_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4489	}
				4490
				4491	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4492	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4493
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4494	/*
				4495	* Consider the group unbalanced when the imbalance is larger
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	4496	* than the average weight of a task.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4497	*
				4498	* APZ: with cgroup the avg task weight can vary wildly and
				4499	* might not be a suitable number - should we keep a
				4500	* normalized nr_running number somewhere that negates
				4501	* the hierarchy?
				4502	*/
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4503	if (sgs->sum_nr_running)
				4504	avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4505
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4506	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
				4507	(max_nr_running - min_nr_running) > 1)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4508	sgs->group_imb = 1;
				4509
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4510	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4511	SCHED_POWER_SCALE);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4512	if (!sgs->group_capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4513	sgs->group_capacity = fix_small_capacity(env->sd, group);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4514	sgs->group_weight = group->group_weight;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4515
				4516	if (sgs->group_capacity > sgs->sum_nr_running)
				4517	sgs->group_has_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4518	}
				4519
				4520	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4521	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4522	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4523	* @sds: sched_domain statistics
				4524	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4525	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4526	*
				4527	* Determine if @sg is a busier group than the previously selected
				4528	* busiest group.
				4529	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4530	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4531	struct sd_lb_stats *sds,
				4532	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4533	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4534	{
				4535	if (sgs->avg_load <= sds->max_load)
				4536	return false;
				4537
				4538	if (sgs->sum_nr_running > sgs->group_capacity)
				4539	return true;
				4540
				4541	if (sgs->group_imb)
				4542	return true;
				4543
				4544	/*
				4545	* ASYM_PACKING needs to move all the work to the lowest
				4546	* numbered CPUs in the group, therefore mark all groups
				4547	* higher than ourself as busy.
				4548	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4549	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
				4550	env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4551	if (!sds->busiest)
				4552	return true;
				4553
				4554	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
				4555	return true;
				4556	}
				4557
				4558	return false;
				4559	}
				4560
				4561	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	4562	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4563	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4564	* @balance: Should we balance.
				4565	* @sds: variable to hold the statistics for this sched_domain.
				4566	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4567	static inline void update_sd_lb_stats(struct lb_env *env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4568	int balance, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4569	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4570	struct sched_domain *child = env->sd->child;
				4571	struct sched_group *sg = env->sd->groups;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4572	struct sg_lb_stats sgs;
				4573	int load_idx, prefer_sibling = 0;
				4574
				4575	if (child && child->flags & SD_PREFER_SIBLING)
				4576	prefer_sibling = 1;
				4577
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4578	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4579
				4580	do {
				4581	int local_group;
				4582
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4583	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4584	memset(&sgs, 0, sizeof(sgs));
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4585	update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4586
Peter Zijlstra	8f190fb	2009-12-24 14:18:21 +0100	[diff] [blame]	4587	if (local_group && !(*balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4588	return;
				4589
				4590	sds->total_load += sgs.group_load;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4591	sds->total_pwr += sg->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4592
				4593	/*
				4594	* In case the child domain prefers tasks go to siblings
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4595	* first, lower the sg capacity to one so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	4596	* and move all the excess tasks away. We lower the capacity
				4597	* of a group only if the local group has the capacity to fit
				4598	* these excess tasks, i.e. nr_running < group_capacity. The
				4599	* extra check prevents the case where you always pull from the
				4600	* heaviest group when it is already under-utilized (possible
				4601	* with a large weight task outweighs the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4602	*/
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	4603	if (prefer_sibling && !local_group && sds->this_has_capacity)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4604	sgs.group_capacity = min(sgs.group_capacity, 1UL);
				4605
				4606	if (local_group) {
				4607	sds->this_load = sgs.avg_load;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4608	sds->this = sg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4609	sds->this_nr_running = sgs.sum_nr_running;
				4610	sds->this_load_per_task = sgs.sum_weighted_load;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4611	sds->this_has_capacity = sgs.group_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4612	sds->this_idle_cpus = sgs.idle_cpus;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4613	} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4614	sds->max_load = sgs.avg_load;
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4615	sds->busiest = sg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4616	sds->busiest_nr_running = sgs.sum_nr_running;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4617	sds->busiest_idle_cpus = sgs.idle_cpus;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4618	sds->busiest_group_capacity = sgs.group_capacity;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4619	sds->busiest_load_per_task = sgs.sum_weighted_load;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4620	sds->busiest_has_capacity = sgs.group_has_capacity;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4621	sds->busiest_group_weight = sgs.group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4622	sds->group_imb = sgs.group_imb;
				4623	}
				4624
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4625	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4626	} while (sg != env->sd->groups);
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4627	}
				4628
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4629	/**
				4630	* check_asym_packing - Check to see if the group is packed into the
				4631	* sched doman.
				4632	*
				4633	* This is primarily intended to used at the sibling level. Some
				4634	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				4635	* case of POWER7, it can move to lower SMT modes only when higher
				4636	* threads are idle. When in lower SMT modes, the threads will
				4637	* perform better since they share less core resources. Hence when we
				4638	* have idle threads, we want them to be the higher ones.
				4639	*
				4640	* This packing function is run on idle threads. It checks to see if
				4641	* the busiest CPU in this domain (core in the P7 case) has a higher
				4642	* CPU number than the packing function is being run on. Here we are
				4643	* assuming lower CPU number will be equivalent to lower a SMT thread
				4644	* number.
				4645	*
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4646	* Returns 1 when packing is required and a task should be moved to
				4647	* this CPU. The amount of the imbalance is returned in *imbalance.
				4648	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4649	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4650	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4651	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4652	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4653	{
				4654	int busiest_cpu;
				4655
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4656	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4657	return 0;
				4658
				4659	if (!sds->busiest)
				4660	return 0;
				4661
				4662	busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4663	if (env->dst_cpu > busiest_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4664	return 0;
				4665
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4666	env->imbalance = DIV_ROUND_CLOSEST(
				4667	sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
				4668
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4669	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4670	}
				4671
				4672	/**
				4673	* fix_small_imbalance - Calculate the minor imbalance that exists
				4674	* amongst the groups of a sched_domain, during
				4675	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4676	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4677	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4678	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4679	static inline
				4680	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4681	{
				4682	unsigned long tmp, pwr_now = 0, pwr_move = 0;
				4683	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4684	unsigned long scaled_busy_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4685
				4686	if (sds->this_nr_running) {
				4687	sds->this_load_per_task /= sds->this_nr_running;
				4688	if (sds->busiest_load_per_task >
				4689	sds->this_load_per_task)
				4690	imbn = 1;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4691	} else {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4692	sds->this_load_per_task =
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4693	cpu_avg_load_per_task(env->dst_cpu);
				4694	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4695
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4696	scaled_busy_load_per_task = sds->busiest_load_per_task
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4697	* SCHED_POWER_SCALE;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4698	scaled_busy_load_per_task /= sds->busiest->sgp->power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4699
				4700	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
				4701	(scaled_busy_load_per_task * imbn)) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4702	env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4703	return;
				4704	}
				4705
				4706	/*
				4707	* OK, we don't have enough imbalance to justify moving tasks,
				4708	* however we may be able to increase total CPU power used by
				4709	* moving them.
				4710	*/
				4711
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4712	pwr_now += sds->busiest->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4713	min(sds->busiest_load_per_task, sds->max_load);
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4714	pwr_now += sds->this->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4715	min(sds->this_load_per_task, sds->this_load);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4716	pwr_now /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4717
				4718	/* Amount of load we'd subtract */
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4719	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4720	sds->busiest->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4721	if (sds->max_load > tmp)
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4722	pwr_move += sds->busiest->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4723	min(sds->busiest_load_per_task, sds->max_load - tmp);
				4724
				4725	/* Amount of load we'd add */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4726	if (sds->max_load * sds->busiest->sgp->power <
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4727	sds->busiest_load_per_task * SCHED_POWER_SCALE)
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4728	tmp = (sds->max_load * sds->busiest->sgp->power) /
				4729	sds->this->sgp->power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4730	else
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4731	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4732	sds->this->sgp->power;
				4733	pwr_move += sds->this->sgp->power *
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4734	min(sds->this_load_per_task, sds->this_load + tmp);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4735	pwr_move /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4736
				4737	/* Move if we gain throughput */
				4738	if (pwr_move > pwr_now)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4739	env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4740	}
				4741
				4742	/**
				4743	* calculate_imbalance - Calculate the amount of imbalance present within the
				4744	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4745	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4746	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4747	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4748	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4749	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4750	unsigned long max_pull, load_above_capacity = ~0UL;
				4751
				4752	sds->busiest_load_per_task /= sds->busiest_nr_running;
				4753	if (sds->group_imb) {
				4754	sds->busiest_load_per_task =
				4755	min(sds->busiest_load_per_task, sds->avg_load);
				4756	}
				4757
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4758	/*
				4759	* In the presence of smp nice balancing, certain scenarios can have
				4760	* max load less than avg load(as we skip the groups at or below
				4761	* its cpu_power, while calculating max_load..)
				4762	*/
				4763	if (sds->max_load < sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4764	env->imbalance = 0;
				4765	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4766	}
				4767
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4768	if (!sds->group_imb) {
				4769	/*
				4770	* Don't want to pull so many tasks that a group would go idle.
				4771	*/
				4772	load_above_capacity = (sds->busiest_nr_running -
				4773	sds->busiest_group_capacity);
				4774
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4775	load_above_capacity = (SCHED_LOAD_SCALE SCHED_POWER_SCALE);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4776
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4777	load_above_capacity /= sds->busiest->sgp->power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4778	}
				4779
				4780	/*
				4781	* We're trying to get all the cpus to the average_load, so we don't
				4782	* want to push ourselves above the average load, nor do we wish to
				4783	* reduce the max loaded cpu below the average load. At the same time,
				4784	* we also don't want to reduce the group load below the group capacity
				4785	* (so that we can implement power-savings policies etc). Thus we look
				4786	* for the minimum possible imbalance.
				4787	* Be careful of negative numbers as they'll appear as very large values
				4788	* with unsigned longs.
				4789	*/
				4790	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4791
				4792	/* How much load to actually move to equalise the imbalance */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4793	env->imbalance = min(max_pull * sds->busiest->sgp->power,
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4794	(sds->avg_load - sds->this_load) * sds->this->sgp->power)
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4795	/ SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4796
				4797	/*
				4798	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	4799	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4800	* a think about bumping its value to force at least one task to be
				4801	* moved
				4802	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4803	if (env->imbalance < sds->busiest_load_per_task)
				4804	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4805
				4806	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4807
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4808	/***** find_busiest_group() helpers end here *******************/
				4809
				4810	/**
				4811	* find_busiest_group - Returns the busiest group within the sched_domain
				4812	* if there is an imbalance. If there isn't an imbalance, and
				4813	* the user has opted for power-savings, it returns a group whose
				4814	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
				4815	* such a group exists.
				4816	*
				4817	* Also calculates the amount of weighted load which should be moved
				4818	* to restore balance.
				4819	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4820	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4821	* @balance: Pointer to a variable indicating if this_cpu
				4822	* is the appropriate cpu to perform load balancing at this_level.
				4823	*
				4824	* Returns: - the busiest group if imbalance exists.
				4825	* - If no imbalance and user has opted for power-savings balance,
				4826	* return the least loaded group whose CPUs can be
				4827	* put to idle by rebalancing its tasks onto our group.
				4828	*/
				4829	static struct sched_group *
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4830	find_busiest_group(struct lb_env env, int balance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4831	{
				4832	struct sd_lb_stats sds;
				4833
				4834	memset(&sds, 0, sizeof(sds));
				4835
				4836	/*
				4837	* Compute the various statistics relavent for load balancing at
				4838	* this level.
				4839	*/
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4840	update_sd_lb_stats(env, balance, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4841
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4842	/*
				4843	* this_cpu is not the appropriate cpu to perform load balancing at
				4844	* this level.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4845	*/
Peter Zijlstra	8f190fb	2009-12-24 14:18:21 +0100	[diff] [blame]	4846	if (!(*balance))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4847	goto ret;
				4848
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4849	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&
				4850	check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4851	return sds.busiest;
				4852
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4853	/* There is no busy sibling group to pull tasks from */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4854	if (!sds.busiest \|\| sds.busiest_nr_running == 0)
				4855	goto out_balanced;
				4856
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4857	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	4858
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	4859	/*
				4860	* If the busiest group is imbalanced the below checks don't
				4861	* work because they assumes all things are equal, which typically
				4862	* isn't true due to cpus_allowed constraints and the like.
				4863	*/
				4864	if (sds.group_imb)
				4865	goto force_balance;
				4866
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4867	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4868	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4869	!sds.busiest_has_capacity)
				4870	goto force_balance;
				4871
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4872	/*
				4873	* If the local group is more busy than the selected busiest group
				4874	* don't try and pull any tasks.
				4875	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4876	if (sds.this_load >= sds.max_load)
				4877	goto out_balanced;
				4878
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	4879	/*
				4880	* Don't pull any tasks if this group is already above the domain
				4881	* average load.
				4882	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4883	if (sds.this_load >= sds.avg_load)
				4884	goto out_balanced;
				4885
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4886	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4887	/*
				4888	* This cpu is idle. If the busiest group load doesn't
				4889	* have more tasks than the number of available cpu's and
				4890	* there is no imbalance between this and busiest group
				4891	* wrt to idle cpu's, it is balanced.
				4892	*/
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4893	if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4894	sds.busiest_nr_running <= sds.busiest_group_weight)
				4895	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4896	} else {
				4897	/*
				4898	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				4899	* imbalance_pct to be conservative.
				4900	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4901	if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	4902	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4903	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4904
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4905	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4906	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4907	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4908	return sds.busiest;
				4909
				4910	out_balanced:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4911	ret:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4912	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4913	return NULL;
				4914	}
				4915
				4916	/*
				4917	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				4918	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4919	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4920	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4921	{
				4922	struct rq busiest = NULL, rq;
				4923	unsigned long max_load = 0;
				4924	int i;
				4925
				4926	for_each_cpu(i, sched_group_cpus(group)) {
				4927	unsigned long power = power_of(i);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4928	unsigned long capacity = DIV_ROUND_CLOSEST(power,
				4929	SCHED_POWER_SCALE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4930	unsigned long wl;
				4931
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4932	if (!capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4933	capacity = fix_small_capacity(env->sd, group);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4934
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4935	if (!cpumask_test_cpu(i, env->cpus))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4936	continue;
				4937
				4938	rq = cpu_rq(i);
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4939	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4940
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4941	/*
				4942	* When comparing with imbalance, use weighted_cpuload()
				4943	* which is not scaled with the cpu power.
				4944	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4945	if (capacity && rq->nr_running == 1 && wl > env->imbalance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4946	continue;
				4947
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4948	/*
				4949	* For the load comparisons with the other cpu's, consider
				4950	* the weighted_cpuload() scaled with the cpu power, so that
				4951	* the load can be moved away from the cpu that is potentially
				4952	* running at a lower capacity.
				4953	*/
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4954	wl = (wl * SCHED_POWER_SCALE) / power;
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	4955
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4956	if (wl > max_load) {
				4957	max_load = wl;
				4958	busiest = rq;
				4959	}
				4960	}
				4961
				4962	return busiest;
				4963	}
				4964
				4965	/*
				4966	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				4967	* so long as it is large enough.
				4968	*/
				4969	#define MAX_PINNED_INTERVAL 512
				4970
				4971	/* Working cpumask for load_balance and load_balance_newidle. */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4972	DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4973
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4974	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	4975	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4976	struct sched_domain *sd = env->sd;
				4977
				4978	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4979
				4980	/*
				4981	* ASYM_PACKING needs to force migrate tasks from busy but
				4982	* higher numbered CPUs in order to pack all tasks in the
				4983	* lowest numbered CPUs.
				4984	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4985	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4986	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	4987	}
				4988
				4989	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				4990	}
				4991
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	4992	static int active_load_balance_cpu_stop(void *data);
				4993
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4994	/*
				4995	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				4996	* tasks if there is an imbalance.
				4997	*/
				4998	static int load_balance(int this_cpu, struct rq *this_rq,
				4999	struct sched_domain *sd, enum cpu_idle_type idle,
				5000	int *balance)
				5001	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5002	int ld_moved, cur_ld_moved, active_balance = 0;
				5003	int lb_iterations, max_lb_iterations;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5004	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5005	struct rq *busiest;
				5006	unsigned long flags;
				5007	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
				5008
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5009	struct lb_env env = {
				5010	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5011	.dst_cpu = this_cpu,
				5012	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5013	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5014	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	5015	.loop_break = sched_nr_migrate_break,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5016	.cpus = cpus,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5017	};
				5018
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5019	cpumask_copy(cpus, cpu_active_mask);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5020	max_lb_iterations = cpumask_weight(env.dst_grpmask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5021
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5022	schedstat_inc(sd, lb_count[idle]);
				5023
				5024	redo:
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5025	group = find_busiest_group(&env, balance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5026
				5027	if (*balance == 0)
				5028	goto out_balanced;
				5029
				5030	if (!group) {
				5031	schedstat_inc(sd, lb_nobusyg[idle]);
				5032	goto out_balanced;
				5033	}
				5034
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5035	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5036	if (!busiest) {
				5037	schedstat_inc(sd, lb_nobusyq[idle]);
				5038	goto out_balanced;
				5039	}
				5040
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5041	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5042
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5043	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5044
				5045	ld_moved = 0;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5046	lb_iterations = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5047	if (busiest->nr_running > 1) {
				5048	/*
				5049	* Attempt to move tasks. If find_busiest_group has found
				5050	* an imbalance but busiest->nr_running <= 1, the group is
				5051	* still unbalanced. ld_moved simply stays zero, so it is
				5052	* correctly treated as an imbalance.
				5053	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5054	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	5055	env.src_cpu = busiest->cpu;
				5056	env.src_rq = busiest;
				5057	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5058
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	5059	update_h_load(env.src_cpu);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5060	more_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5061	local_irq_save(flags);
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5062	double_rq_lock(env.dst_rq, busiest);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5063
				5064	/*
				5065	* cur_ld_moved - load moved in current iteration
				5066	* ld_moved - cumulative load moved across iterations
				5067	*/
				5068	cur_ld_moved = move_tasks(&env);
				5069	ld_moved += cur_ld_moved;
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5070	double_rq_unlock(env.dst_rq, busiest);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5071	local_irq_restore(flags);
				5072
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5073	if (env.flags & LBF_NEED_BREAK) {
				5074	env.flags &= ~LBF_NEED_BREAK;
				5075	goto more_balance;
				5076	}
				5077
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5078	/*
				5079	* some other cpu did the load balance for us.
				5080	*/
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5081	if (cur_ld_moved && env.dst_cpu != smp_processor_id())
				5082	resched_cpu(env.dst_cpu);
				5083
				5084	/*
				5085	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				5086	* us and move them to an alternate dst_cpu in our sched_group
				5087	* where they can run. The upper limit on how many times we
				5088	* iterate on same src_cpu is dependent on number of cpus in our
				5089	* sched_group.
				5090	*
				5091	* This changes load balance semantics a bit on who can move
				5092	* load to a given_cpu. In addition to the given_cpu itself
				5093	* (or a ilb_cpu acting on its behalf where given_cpu is
				5094	* nohz-idle), we now have balance_cpu in a position to move
				5095	* load to given_cpu. In rare situations, this may cause
				5096	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				5097	* _independently_ and at _same_ time to move some load to
				5098	* given_cpu) causing exceess load to be moved to given_cpu.
				5099	* This however should not happen so much in practice and
				5100	* moreover subsequent load balance cycles should correct the
				5101	* excess load moved.
				5102	*/
				5103	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
				5104	lb_iterations++ < max_lb_iterations) {
				5105
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5106	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5107	env.dst_cpu = env.new_dst_cpu;
				5108	env.flags &= ~LBF_SOME_PINNED;
				5109	env.loop = 0;
				5110	env.loop_break = sched_nr_migrate_break;
				5111	/*
				5112	* Go back to "more_balance" rather than "redo" since we
				5113	* need to continue with same src_cpu.
				5114	*/
				5115	goto more_balance;
				5116	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5117
				5118	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5119	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5120	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5121	if (!cpumask_empty(cpus)) {
				5122	env.loop = 0;
				5123	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5124	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5125	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5126	goto out_balanced;
				5127	}
				5128	}
				5129
				5130	if (!ld_moved) {
				5131	schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	5132	/*
				5133	* Increment the failure counter only on periodic balance.
				5134	* We do not want newidle balance, which can be very
				5135	* frequent, pollute the failure counter causing
				5136	* excessive cache_hot migrations and active balances.
				5137	*/
				5138	if (idle != CPU_NEWLY_IDLE)
				5139	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5140
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5141	if (need_active_balance(&env)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5142	raw_spin_lock_irqsave(&busiest->lock, flags);
				5143
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5144	/* don't kick the active_load_balance_cpu_stop,
				5145	* if the curr task on busiest cpu can't be
				5146	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5147	*/
				5148	if (!cpumask_test_cpu(this_cpu,
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	5149	tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5150	raw_spin_unlock_irqrestore(&busiest->lock,
				5151	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5152	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5153	goto out_one_pinned;
				5154	}
				5155
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5156	/*
				5157	* ->active_balance synchronizes accesses to
				5158	* ->active_balance_work. Once set, it's cleared
				5159	* only after active load balance is finished.
				5160	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5161	if (!busiest->active_balance) {
				5162	busiest->active_balance = 1;
				5163	busiest->push_cpu = this_cpu;
				5164	active_balance = 1;
				5165	}
				5166	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5167
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5168	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5169	stop_one_cpu_nowait(cpu_of(busiest),
				5170	active_load_balance_cpu_stop, busiest,
				5171	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5172	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5173
				5174	/*
				5175	* We've kicked active balancing, reset the failure
				5176	* counter.
				5177	*/
				5178	sd->nr_balance_failed = sd->cache_nice_tries+1;
				5179	}
				5180	} else
				5181	sd->nr_balance_failed = 0;
				5182
				5183	if (likely(!active_balance)) {
				5184	/* We were unbalanced, so reset the balancing interval */
				5185	sd->balance_interval = sd->min_interval;
				5186	} else {
				5187	/*
				5188	* If we've begun active balancing, start to back off. This
				5189	* case may not be covered by the all_pinned logic if there
				5190	* is only 1 task on the busy runqueue (because we don't call
				5191	* move_tasks).
				5192	*/
				5193	if (sd->balance_interval < sd->max_interval)
				5194	sd->balance_interval *= 2;
				5195	}
				5196
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5197	goto out;
				5198
				5199	out_balanced:
				5200	schedstat_inc(sd, lb_balanced[idle]);
				5201
				5202	sd->nr_balance_failed = 0;
				5203
				5204	out_one_pinned:
				5205	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5206	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	5207	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5208	(sd->balance_interval < sd->max_interval))
				5209	sd->balance_interval *= 2;
				5210
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	5211	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5212	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5213	return ld_moved;
				5214	}
				5215
				5216	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5217	* idle_balance is called by schedule() if this_cpu is about to become
				5218	* idle. Attempts to pull tasks from other CPUs.
				5219	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5220	void idle_balance(int this_cpu, struct rq *this_rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5221	{
				5222	struct sched_domain *sd;
				5223	int pulled_task = 0;
				5224	unsigned long next_balance = jiffies + HZ;
				5225
				5226	this_rq->idle_stamp = this_rq->clock;
				5227
				5228	if (this_rq->avg_idle < sysctl_sched_migration_cost)
				5229	return;
				5230
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	5231	update_rq_runnable_avg(this_rq, 1);
				5232
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5233	/*
				5234	* Drop the rq->lock, but keep IRQ/preempt disabled.
				5235	*/
				5236	raw_spin_unlock(&this_rq->lock);
				5237
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5238	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5239	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5240	for_each_domain(this_cpu, sd) {
				5241	unsigned long interval;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5242	int balance = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5243
				5244	if (!(sd->flags & SD_LOAD_BALANCE))
				5245	continue;
				5246
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5247	if (sd->flags & SD_BALANCE_NEWIDLE) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5248	/* If we've pulled tasks over stop searching: */
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5249	pulled_task = load_balance(this_cpu, this_rq,
				5250	sd, CPU_NEWLY_IDLE, &balance);
				5251	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5252
				5253	interval = msecs_to_jiffies(sd->balance_interval);
				5254	if (time_after(next_balance, sd->last_balance + interval))
				5255	next_balance = sd->last_balance + interval;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5256	if (pulled_task) {
				5257	this_rq->idle_stamp = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5258	break;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5259	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5260	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5261	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5262
				5263	raw_spin_lock(&this_rq->lock);
				5264
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5265	if (pulled_task \|\| time_after(jiffies, this_rq->next_balance)) {
				5266	/*
				5267	* We are going idle. next_balance may be set based on
				5268	* a busy processor. So reset next_balance.
				5269	*/
				5270	this_rq->next_balance = next_balance;
				5271	}
				5272	}
				5273
				5274	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5275	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				5276	* running tasks off the busiest CPU onto idle CPUs. It requires at
				5277	* least 1 task to be running on each physical CPU where possible, and
				5278	* avoids physical / logical imbalances.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5279	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5280	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5281	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5282	struct rq *busiest_rq = data;
				5283	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5284	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5285	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5286	struct sched_domain *sd;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5287
				5288	raw_spin_lock_irq(&busiest_rq->lock);
				5289
				5290	/* make sure the requested cpu hasn't gone down in the meantime */
				5291	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				5292	!busiest_rq->active_balance))
				5293	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5294
				5295	/* Is there any task to move? */
				5296	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5297	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5298
				5299	/*
				5300	* This condition is "impossible", if it occurs
				5301	* we need to fix it. Originally reported by
				5302	* Bjorn Helgaas on a 128-cpu setup.
				5303	*/
				5304	BUG_ON(busiest_rq == target_rq);
				5305
				5306	/* move a task from busiest_rq to target_rq */
				5307	double_lock_balance(busiest_rq, target_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5308
				5309	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5310	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5311	for_each_domain(target_cpu, sd) {
				5312	if ((sd->flags & SD_LOAD_BALANCE) &&
				5313	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				5314	break;
				5315	}
				5316
				5317	if (likely(sd)) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5318	struct lb_env env = {
				5319	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5320	.dst_cpu = target_cpu,
				5321	.dst_rq = target_rq,
				5322	.src_cpu = busiest_rq->cpu,
				5323	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5324	.idle = CPU_IDLE,
				5325	};
				5326
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5327	schedstat_inc(sd, alb_count);
				5328
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5329	if (move_one_task(&env))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5330	schedstat_inc(sd, alb_pushed);
				5331	else
				5332	schedstat_inc(sd, alb_failed);
				5333	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5334	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5335	double_unlock_balance(busiest_rq, target_rq);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5336	out_unlock:
				5337	busiest_rq->active_balance = 0;
				5338	raw_spin_unlock_irq(&busiest_rq->lock);
				5339	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5340	}
				5341
				5342	#ifdef CONFIG_NO_HZ
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5343	/*
				5344	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5345	* - When one of the busy CPUs notice that there may be an idle rebalancing
				5346	* needed, they will kick the idle load balancer, which then does idle
				5347	* load balancing for all the idle CPUs.
				5348	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5349	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5350	cpumask_var_t idle_cpus_mask;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5351	atomic_t nr_cpus;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5352	unsigned long next_balance; /* in jiffy units */
				5353	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5354
Peter Zijlstra	8e7fbcb	2012-01-09 11:28:35 +0100	[diff] [blame]	5355	static inline int find_new_ilb(int call_cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5356	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5357	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5358
Suresh Siddha	786d6dc7	2011-12-01 17:07:35 -0800	[diff] [blame]	5359	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				5360	return ilb;
				5361
				5362	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5363	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5364
				5365	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5366	* Kick a CPU to do the nohz balancing, if it is time for it. We pick the
				5367	* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
				5368	* CPU (if there is one).
				5369	*/
				5370	static void nohz_balancer_kick(int cpu)
				5371	{
				5372	int ilb_cpu;
				5373
				5374	nohz.next_balance++;
				5375
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5376	ilb_cpu = find_new_ilb(cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5377
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5378	if (ilb_cpu >= nr_cpu_ids)
				5379	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5380
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	5381	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5382	return;
				5383	/*
				5384	* Use smp_send_reschedule() instead of resched_cpu().
				5385	* This way we generate a sched IPI on the target cpu which
				5386	* is idle. And the softirq performing nohz idle load balance
				5387	* will be run before returning from the IPI.
				5388	*/
				5389	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5390	return;
				5391	}
				5392
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5393	static inline void nohz_balance_exit_idle(int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5394	{
				5395	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
				5396	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				5397	atomic_dec(&nohz.nr_cpus);
				5398	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				5399	}
				5400	}
				5401
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5402	static inline void set_cpu_sd_state_busy(void)
				5403	{
				5404	struct sched_domain *sd;
				5405	int cpu = smp_processor_id();
				5406
				5407	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
				5408	return;
				5409	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
				5410
				5411	rcu_read_lock();
				5412	for_each_domain(cpu, sd)
				5413	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
				5414	rcu_read_unlock();
				5415	}
				5416
				5417	void set_cpu_sd_state_idle(void)
				5418	{
				5419	struct sched_domain *sd;
				5420	int cpu = smp_processor_id();
				5421
				5422	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
				5423	return;
				5424	set_bit(NOHZ_IDLE, nohz_flags(cpu));
				5425
				5426	rcu_read_lock();
				5427	for_each_domain(cpu, sd)
				5428	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
				5429	rcu_read_unlock();
				5430	}
				5431
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5432	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5433	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5434	* This info will be used in performing idle load balancing in the future.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5435	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5436	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5437	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5438	/*
				5439	* If this cpu is going down, then nothing needs to be done.
				5440	*/
				5441	if (!cpu_active(cpu))
				5442	return;
				5443
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5444	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				5445	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5446
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5447	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				5448	atomic_inc(&nohz.nr_cpus);
				5449	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5450	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5451
				5452	static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
				5453	unsigned long action, void *hcpu)
				5454	{
				5455	switch (action & ~CPU_TASKS_FROZEN) {
				5456	case CPU_DYING:
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5457	nohz_balance_exit_idle(smp_processor_id());
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5458	return NOTIFY_OK;
				5459	default:
				5460	return NOTIFY_DONE;
				5461	}
				5462	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5463	#endif
				5464
				5465	static DEFINE_SPINLOCK(balancing);
				5466
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5467	/*
				5468	* Scale the max load_balance interval with the number of CPUs in the system.
				5469	* This trades load-balance latency on larger machines for less cross talk.
				5470	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5471	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5472	{
				5473	max_load_balance_interval = HZ*num_online_cpus()/10;
				5474	}
				5475
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5476	/*
				5477	* It checks each scheduling domain to see if it is due to be balanced,
				5478	* and initiates a balancing operation if so.
				5479	*
				5480	* Balancing parameters are set up in arch_init_sched_domains.
				5481	*/
				5482	static void rebalance_domains(int cpu, enum cpu_idle_type idle)
				5483	{
				5484	int balance = 1;
				5485	struct rq *rq = cpu_rq(cpu);
				5486	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	5487	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5488	/* Earliest time when we have to do rebalance again */
				5489	unsigned long next_balance = jiffies + 60*HZ;
				5490	int update_next_balance = 0;
				5491	int need_serialize;
				5492
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5493	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	5494
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5495	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5496	for_each_domain(cpu, sd) {
				5497	if (!(sd->flags & SD_LOAD_BALANCE))
				5498	continue;
				5499
				5500	interval = sd->balance_interval;
				5501	if (idle != CPU_IDLE)
				5502	interval *= sd->busy_factor;
				5503
				5504	/* scale ms to jiffies */
				5505	interval = msecs_to_jiffies(interval);
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5506	interval = clamp(interval, 1UL, max_load_balance_interval);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5507
				5508	need_serialize = sd->flags & SD_SERIALIZE;
				5509
				5510	if (need_serialize) {
				5511	if (!spin_trylock(&balancing))
				5512	goto out;
				5513	}
				5514
				5515	if (time_after_eq(jiffies, sd->last_balance + interval)) {
				5516	if (load_balance(cpu, rq, sd, idle, &balance)) {
				5517	/*
				5518	* We've pulled tasks over so either we're no
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	5519	* longer idle.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5520	*/
				5521	idle = CPU_NOT_IDLE;
				5522	}
				5523	sd->last_balance = jiffies;
				5524	}
				5525	if (need_serialize)
				5526	spin_unlock(&balancing);
				5527	out:
				5528	if (time_after(next_balance, sd->last_balance + interval)) {
				5529	next_balance = sd->last_balance + interval;
				5530	update_next_balance = 1;
				5531	}
				5532
				5533	/*
				5534	* Stop the load balance at this level. There is another
				5535	* CPU in our sched group which is doing load balancing more
				5536	* actively.
				5537	*/
				5538	if (!balance)
				5539	break;
				5540	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5541	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5542
				5543	/*
				5544	* next_balance will be updated only when there is a need.
				5545	* When the cpu is attached to null domain for ex, it will not be
				5546	* updated.
				5547	*/
				5548	if (likely(update_next_balance))
				5549	rq->next_balance = next_balance;
				5550	}
				5551
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5552	#ifdef CONFIG_NO_HZ
				5553	/*
				5554	* In CONFIG_NO_HZ case, the idle balance kickee will do the
				5555	* rebalancing for all the cpus for whom scheduler ticks are stopped.
				5556	*/
				5557	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
				5558	{
				5559	struct rq *this_rq = cpu_rq(this_cpu);
				5560	struct rq *rq;
				5561	int balance_cpu;
				5562
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5563	if (idle != CPU_IDLE \|\|
				5564	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				5565	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5566
				5567	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	5568	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5569	continue;
				5570
				5571	/*
				5572	* If this cpu gets work to do, stop the load balancing
				5573	* work being done for other cpus. Next load
				5574	* balancing owner will pick it up.
				5575	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5576	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5577	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5578
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	5579	rq = cpu_rq(balance_cpu);
				5580
				5581	raw_spin_lock_irq(&rq->lock);
				5582	update_rq_clock(rq);
				5583	update_idle_cpu_load(rq);
				5584	raw_spin_unlock_irq(&rq->lock);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5585
				5586	rebalance_domains(balance_cpu, CPU_IDLE);
				5587
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5588	if (time_after(this_rq->next_balance, rq->next_balance))
				5589	this_rq->next_balance = rq->next_balance;
				5590	}
				5591	nohz.next_balance = this_rq->next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5592	end:
				5593	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5594	}
				5595
				5596	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5597	* Current heuristic for kicking the idle load balancer in the presence
				5598	* of an idle cpu is the system.
				5599	* - This rq has more than one task.
				5600	* - At any scheduler domain level, this cpu's scheduler group has multiple
				5601	* busy cpu's exceeding the group's power.
				5602	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				5603	* domain span are idle.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5604	*/
				5605	static inline int nohz_kick_needed(struct rq *rq, int cpu)
				5606	{
				5607	unsigned long now = jiffies;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5608	struct sched_domain *sd;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5609
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5610	if (unlikely(idle_cpu(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5611	return 0;
				5612
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5613	/*
				5614	* We may be recently in ticked or tickless idle mode. At the first
				5615	* busy tick after returning from idle, we will update the busy stats.
				5616	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5617	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5618	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5619
				5620	/*
				5621	* None are in tickless mode and hence no need for NOHZ idle load
				5622	* balancing.
				5623	*/
				5624	if (likely(!atomic_read(&nohz.nr_cpus)))
				5625	return 0;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5626
				5627	if (time_before(now, nohz.next_balance))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5628	return 0;
				5629
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5630	if (rq->nr_running >= 2)
				5631	goto need_kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5632
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5633	rcu_read_lock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5634	for_each_domain(cpu, sd) {
				5635	struct sched_group *sg = sd->groups;
				5636	struct sched_group_power *sgp = sg->sgp;
				5637	int nr_busy = atomic_read(&sgp->nr_busy_cpus);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5638
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5639	if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5640	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5641
				5642	if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
				5643	&& (cpumask_first_and(nohz.idle_cpus_mask,
				5644	sched_domain_span(sd)) < cpu))
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5645	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5646
				5647	if (!(sd->flags & (SD_SHARE_PKG_RESOURCES \| SD_ASYM_PACKING)))
				5648	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5649	}
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5650	rcu_read_unlock();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5651	return 0;
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5652
				5653	need_kick_unlock:
				5654	rcu_read_unlock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5655	need_kick:
				5656	return 1;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5657	}
				5658	#else
				5659	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
				5660	#endif
				5661
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5662	/*
				5663	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5664	* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5665	*/
				5666	static void run_rebalance_domains(struct softirq_action *h)
				5667	{
				5668	int this_cpu = smp_processor_id();
				5669	struct rq *this_rq = cpu_rq(this_cpu);
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	5670	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5671	CPU_IDLE : CPU_NOT_IDLE;
				5672
				5673	rebalance_domains(this_cpu, idle);
				5674
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5675	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5676	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5677	* balancing on behalf of the other idle cpus whose ticks are
				5678	* stopped.
				5679	*/
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5680	nohz_idle_balance(this_cpu, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5681	}
				5682
				5683	static inline int on_null_domain(int cpu)
				5684	{
Paul E. McKenney	90a6501	2010-02-28 08:32:18 -0800	[diff] [blame]	5685	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5686	}
				5687
				5688	/*
				5689	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5690	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5691	void trigger_load_balance(struct rq *rq, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5692	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5693	/* Don't need to rebalance while attached to NULL domain */
				5694	if (time_after_eq(jiffies, rq->next_balance) &&
				5695	likely(!on_null_domain(cpu)))
				5696	raise_softirq(SCHED_SOFTIRQ);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5697	#ifdef CONFIG_NO_HZ
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5698	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5699	nohz_balancer_kick(cpu);
				5700	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5701	}
				5702
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	5703	static void rq_online_fair(struct rq *rq)
				5704	{
				5705	update_sysctl();
				5706	}
				5707
				5708	static void rq_offline_fair(struct rq *rq)
				5709	{
				5710	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	5711
				5712	/* Ensure any throttled groups are reachable by pick_next_task */
				5713	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	5714	}
				5715
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	5716	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	5717
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5718	/*
				5719	* scheduler tick hitting a task of our scheduling class:
				5720	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	5721	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5722	{
				5723	struct cfs_rq *cfs_rq;
				5724	struct sched_entity *se = &curr->se;
				5725
				5726	for_each_sched_entity(se) {
				5727	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	5728	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5729	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	5730
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	5731	if (sched_feat_numa(NUMA))
				5732	task_tick_numa(rq, curr);
Linus Torvalds	3d59eeb	2012-12-16 14:33:25 -0800	[diff] [blame]	5733
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	5734	update_rq_runnable_avg(rq, 1);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5735	}
				5736
				5737	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5738	* called on fork with the child task as argument from the parent's context
				5739	* - child not yet on the tasklist
				5740	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5741	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5742	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5743	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	5744	struct cfs_rq *cfs_rq;
				5745	struct sched_entity se = &p->se, curr;
Ingo Molnar	00bf7bf	2007-10-15 17:00:14 +0200	[diff] [blame]	5746	int this_cpu = smp_processor_id();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5747	struct rq *rq = this_rq();
				5748	unsigned long flags;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5749
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	5750	raw_spin_lock_irqsave(&rq->lock, flags);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5751
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	5752	update_rq_clock(rq);
				5753
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	5754	cfs_rq = task_cfs_rq(current);
				5755	curr = cfs_rq->curr;
				5756
Paul E. McKenney	b0a0f66	2010-10-06 17:32:51 -0700	[diff] [blame]	5757	if (unlikely(task_cpu(p) != this_cpu)) {
				5758	rcu_read_lock();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5759	__set_task_cpu(p, this_cpu);
Paul E. McKenney	b0a0f66	2010-10-06 17:32:51 -0700	[diff] [blame]	5760	rcu_read_unlock();
				5761	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5762
Ting Yang	7109c44	2007-08-28 12:53:24 +0200	[diff] [blame]	5763	update_curr(cfs_rq);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5764
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	5765	if (curr)
				5766	se->vruntime = curr->vruntime;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	5767	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5768
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	5769	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	5770	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	5771	* Upon rescheduling, sched_class::put_prev_task() will place
				5772	* 'current' within the tree based on its new key value.
				5773	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5774	swap(curr->vruntime, se->vruntime);
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	5775	resched_task(rq->curr);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	5776	}
				5777
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	5778	se->vruntime -= cfs_rq->min_vruntime;
				5779
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	5780	raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	5781	}
				5782
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5783	/*
				5784	* Priority of the task has changed. Check to see if we preempt
				5785	* the current task.
				5786	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5787	static void
				5788	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5789	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5790	if (!p->se.on_rq)
				5791	return;
				5792
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5793	/*
				5794	* Reschedule if we are currently running on this runqueue and
				5795	* our priority decreased, or if we are not currently running on
				5796	* this runqueue and our priority is higher than the current's
				5797	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5798	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5799	if (p->prio > oldprio)
				5800	resched_task(rq->curr);
				5801	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	5802	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5803	}
				5804
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5805	static void switched_from_fair(struct rq rq, struct task_struct p)
				5806	{
				5807	struct sched_entity *se = &p->se;
				5808	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				5809
				5810	/*
				5811	* Ensure the task's vruntime is normalized, so that when its
				5812	* switched back to the fair class the enqueue_entity(.flags=0) will
				5813	* do the right thing.
				5814	*
				5815	* If it was on_rq, then the dequeue_entity(.flags=0) will already
				5816	* have normalized the vruntime, if it was !on_rq, then only when
				5817	* the task is sleeping will it still have non-normalized vruntime.
				5818	*/
				5819	if (!se->on_rq && p->state != TASK_RUNNING) {
				5820	/*
				5821	* Fix up our vruntime so that the current sleep doesn't
				5822	* cause 'unlimited' sleep bonus.
				5823	*/
				5824	place_entity(cfs_rq, se, 0);
				5825	se->vruntime -= cfs_rq->min_vruntime;
				5826	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5827
				5828	#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
				5829	/*
				5830	* Remove our load from contribution when we leave sched_fair
				5831	* and ensure we don't carry in an old decay_count if we
				5832	* switch back.
				5833	*/
				5834	if (p->se.avg.decay_count) {
				5835	struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
				5836	__synchronize_entity_decay(&p->se);
				5837	subtract_blocked_load_contrib(cfs_rq,
				5838	p->se.avg.load_avg_contrib);
				5839	}
				5840	#endif
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5841	}
				5842
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5843	/*
				5844	* We switched to the sched_fair class.
				5845	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5846	static void switched_to_fair(struct rq rq, struct task_struct p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5847	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5848	if (!p->se.on_rq)
				5849	return;
				5850
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5851	/*
				5852	* We were most likely switched from sched_rt, so
				5853	* kick off the schedule if running, otherwise just see
				5854	* if we can still preempt the current task.
				5855	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	5856	if (rq->curr == p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5857	resched_task(rq->curr);
				5858	else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	5859	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	5860	}
				5861
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	5862	/* Account for a task changing its policy or group.
				5863	*
				5864	* This routine is mostly called to set cfs_rq->curr field when a task
				5865	* migrates between groups/classes.
				5866	*/
				5867	static void set_curr_task_fair(struct rq *rq)
				5868	{
				5869	struct sched_entity *se = &rq->curr->se;
				5870
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	5871	for_each_sched_entity(se) {
				5872	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				5873
				5874	set_next_entity(cfs_rq, se);
				5875	/* ensure bandwidth has been allocated on our new cfs_rq */
				5876	account_cfs_rq_runtime(cfs_rq, 0);
				5877	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	5878	}
				5879
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5880	void init_cfs_rq(struct cfs_rq *cfs_rq)
				5881	{
				5882	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5883	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				5884	#ifndef CONFIG_64BIT
				5885	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				5886	#endif
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5887	#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
				5888	atomic64_set(&cfs_rq->decay_counter, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5889	atomic64_set(&cfs_rq->removed_load, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	5890	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5891	}
				5892
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5893	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5894	static void task_move_group_fair(struct task_struct *p, int on_rq)
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5895	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5896	struct cfs_rq *cfs_rq;
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5897	/*
				5898	* If the task was not on the rq at the time of this cgroup movement
				5899	* it must have been asleep, sleeping tasks keep their ->vruntime
				5900	* absolute on their old rq until wakeup (needed for the fair sleeper
				5901	* bonus in place_entity()).
				5902	*
				5903	* If it was on the rq, we've just 'preempted' it, which does convert
				5904	* ->vruntime to a relative base.
				5905	*
				5906	* Make sure both cases convert their relative position when migrating
				5907	* to another cgroup's rq. This does somewhat interfere with the
				5908	* fair sleeper stuff for the first placement, but who cares.
				5909	*/
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5910	/*
				5911	* When !on_rq, vruntime of the task has usually NOT been normalized.
				5912	* But there are some cases where it has already been normalized:
				5913	*
				5914	* - Moving a forked child which is waiting for being woken up by
				5915	* wake_up_new_task().
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	5916	* - Moving a task which has been woken up by try_to_wake_up() and
				5917	* waiting for actually being woken up by sched_ttwu_pending().
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5918	*
				5919	* To prevent boost or penalty in the new cfs_rq caused by delta
				5920	* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
				5921	*/
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	5922	if (!on_rq && (!p->se.sum_exec_runtime \|\| p->state == TASK_WAKING))
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	5923	on_rq = 1;
				5924
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	5925	if (!on_rq)
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	5926	p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
				5927	set_task_rq(p, task_cpu(p));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	5928	if (!on_rq) {
				5929	cfs_rq = cfs_rq_of(&p->se);
				5930	p->se.vruntime += cfs_rq->min_vruntime;
				5931	#ifdef CONFIG_SMP
				5932	/*
				5933	* migrate_task_rq_fair() will have removed our previous
				5934	* contribution, but we must synchronize for ongoing future
				5935	* decay.
				5936	*/
				5937	p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				5938	cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
				5939	#endif
				5940	}
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	5941	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5942
				5943	void free_fair_sched_group(struct task_group *tg)
				5944	{
				5945	int i;
				5946
				5947	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				5948
				5949	for_each_possible_cpu(i) {
				5950	if (tg->cfs_rq)
				5951	kfree(tg->cfs_rq[i]);
				5952	if (tg->se)
				5953	kfree(tg->se[i]);
				5954	}
				5955
				5956	kfree(tg->cfs_rq);
				5957	kfree(tg->se);
				5958	}
				5959
				5960	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				5961	{
				5962	struct cfs_rq *cfs_rq;
				5963	struct sched_entity *se;
				5964	int i;
				5965
				5966	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				5967	if (!tg->cfs_rq)
				5968	goto err;
				5969	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				5970	if (!tg->se)
				5971	goto err;
				5972
				5973	tg->shares = NICE_0_LOAD;
				5974
				5975	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				5976
				5977	for_each_possible_cpu(i) {
				5978	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				5979	GFP_KERNEL, cpu_to_node(i));
				5980	if (!cfs_rq)
				5981	goto err;
				5982
				5983	se = kzalloc_node(sizeof(struct sched_entity),
				5984	GFP_KERNEL, cpu_to_node(i));
				5985	if (!se)
				5986	goto err_free_rq;
				5987
				5988	init_cfs_rq(cfs_rq);
				5989	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
				5990	}
				5991
				5992	return 1;
				5993
				5994	err_free_rq:
				5995	kfree(cfs_rq);
				5996	err:
				5997	return 0;
				5998	}
				5999
				6000	void unregister_fair_sched_group(struct task_group *tg, int cpu)
				6001	{
				6002	struct rq *rq = cpu_rq(cpu);
				6003	unsigned long flags;
				6004
				6005	/*
				6006	* Only empty task groups can be destroyed; so we can speculatively
				6007	* check on_list without danger of it being re-added.
				6008	*/
				6009	if (!tg->cfs_rq[cpu]->on_list)
				6010	return;
				6011
				6012	raw_spin_lock_irqsave(&rq->lock, flags);
				6013	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				6014	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6015	}
				6016
				6017	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				6018	struct sched_entity *se, int cpu,
				6019	struct sched_entity *parent)
				6020	{
				6021	struct rq *rq = cpu_rq(cpu);
				6022
				6023	cfs_rq->tg = tg;
				6024	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6025	init_cfs_rq_runtime(cfs_rq);
				6026
				6027	tg->cfs_rq[cpu] = cfs_rq;
				6028	tg->se[cpu] = se;
				6029
				6030	/* se could be NULL for root_task_group */
				6031	if (!se)
				6032	return;
				6033
				6034	if (!parent)
				6035	se->cfs_rq = &rq->cfs;
				6036	else
				6037	se->cfs_rq = parent->my_q;
				6038
				6039	se->my_q = cfs_rq;
				6040	update_load_set(&se->load, 0);
				6041	se->parent = parent;
				6042	}
				6043
				6044	static DEFINE_MUTEX(shares_mutex);
				6045
				6046	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				6047	{
				6048	int i;
				6049	unsigned long flags;
				6050
				6051	/*
				6052	* We can't change the weight of the root cgroup.
				6053	*/
				6054	if (!tg->se[0])
				6055	return -EINVAL;
				6056
				6057	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				6058
				6059	mutex_lock(&shares_mutex);
				6060	if (tg->shares == shares)
				6061	goto done;
				6062
				6063	tg->shares = shares;
				6064	for_each_possible_cpu(i) {
				6065	struct rq *rq = cpu_rq(i);
				6066	struct sched_entity *se;
				6067
				6068	se = tg->se[i];
				6069	/* Propagate contribution to hierarchy */
				6070	raw_spin_lock_irqsave(&rq->lock, flags);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	6071	for_each_sched_entity(se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6072	update_cfs_shares(group_cfs_rq(se));
				6073	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6074	}
				6075
				6076	done:
				6077	mutex_unlock(&shares_mutex);
				6078	return 0;
				6079	}
				6080	#else /* CONFIG_FAIR_GROUP_SCHED */
				6081
				6082	void free_fair_sched_group(struct task_group *tg) { }
				6083
				6084	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				6085	{
				6086	return 1;
				6087	}
				6088
				6089	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
				6090
				6091	#endif /* CONFIG_FAIR_GROUP_SCHED */
				6092
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6093
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	6094	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6095	{
				6096	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6097	unsigned int rr_interval = 0;
				6098
				6099	/*
				6100	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				6101	* idle runqueue:
				6102	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6103	if (rq->cfs.load.weight)
				6104	rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6105
				6106	return rr_interval;
				6107	}
				6108
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6109	/*
				6110	* All the scheduling class methods:
				6111	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6112	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	6113	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6114	.enqueue_task = enqueue_task_fair,
				6115	.dequeue_task = dequeue_task_fair,
				6116	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6117	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6118
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	6119	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6120
				6121	.pick_next_task = pick_next_task_fair,
				6122	.put_prev_task = put_prev_task_fair,
				6123
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6124	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	6125	.select_task_rq = select_task_rq_fair,
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	6126	#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6127	.migrate_task_rq = migrate_task_rq_fair,
Paul Turner	f4e26b1	2012-10-04 13:18:32 +0200	[diff] [blame]	6128	#endif
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	6129	.rq_online = rq_online_fair,
				6130	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6131
				6132	.task_waking = task_waking_fair,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6133	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6134
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6135	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6136	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6137	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6138
				6139	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6140	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6141	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6142
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6143	.get_rr_interval = get_rr_interval_fair,
				6144
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6145	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6146	.task_move_group = task_move_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6147	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6148	};
				6149
				6150	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6151	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6152	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6153	struct cfs_rq *cfs_rq;
				6154
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6155	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	6156	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	6157	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6158	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6159	}
				6160	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6161
				6162	__init void init_sched_fair_class(void)
				6163	{
				6164	#ifdef CONFIG_SMP
				6165	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				6166
				6167	#ifdef CONFIG_NO_HZ
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	6168	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6169	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6170	cpu_notifier(sched_ilb_notifier, 0);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6171	#endif
				6172	#endif /* SMP */
				6173
				6174	}