/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 * Interactivity improvements by Mike Galbraith
 * (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 * Various enhancements by Dmitry Adamushko.
 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 * Group scheduling enhancements by Srivatsa Vaddagiri
 * Copyright IBM Corporation, 2007
 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 * Scaled math optimizations by Thomas Gleixner
 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
	= SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static int get_update_sysctl_factor(void)
{
	unsigned int cpus = min_t(int, num_online_cpus(), 8);
	unsigned int factor;

	switch (sysctl_sched_tunable_scaling) {
	case SCHED_TUNABLESCALING_NONE:
		factor = 1;
		break;
	case SCHED_TUNABLESCALING_LINEAR:
		factor = cpus;
		break;
	case SCHED_TUNABLESCALING_LOG:
	default:
		factor = 1 + ilog2(cpus);
		break;
	}

	return factor;
}

static void update_sysctl(void)
{
	unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
	(sysctl_##name = (factor) * normalized_sysctl_##name)
	SET_SYSCTL(sched_min_granularity);
	SET_SYSCTL(sched_latency);
	SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}
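
/*
 * Illustrative example (not part of the original file): with the default
 * SCHED_TUNABLESCALING_LOG policy on a box with 8 or more online CPUs,
 * factor = 1 + ilog2(8) = 4, so the effective tunables become roughly
 * sched_latency = 24ms, sched_min_granularity = 3ms and
 * sched_wakeup_granularity = 4ms; with SCHED_TUNABLESCALING_NONE they
 * stay at the normalized 6ms / 0.75ms / 1ms defaults.
 */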

void sched_init_granularity(void)
{
	update_sysctl();
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))

/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	/*
	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
	 * 2^SCHED_LOAD_RESOLUTION.
	 */
	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
		tmp = (u64)delta_exec * scale_load_down(weight);
	else
		tmp = (u64)delta_exec;

	if (!lw->inv_weight) {
		unsigned long w = scale_load_down(lw->weight);

		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
			lw->inv_weight = 1;
		else if (unlikely(!w))
			lw->inv_weight = WMULT_CONST;
		else
			lw->inv_weight = WMULT_CONST / w;
	}

	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST))
		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
			WMULT_SHIFT/2);
	else
		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
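
/*
 * Illustrative only: the above computes delta_exec * weight / lw->weight
 * in fixed point, using a cached 2^32 / weight inverse multiplier.
 * E.g. 1ms of delta_exec for a nice-0 entity (weight 1024) charged
 * against a queue of total weight 2048 comes out at ~0.5ms.
 */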


const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)	(!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!entity_is_task(se));
#endif
	return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
		for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return grp->my_q;
}

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				       int force_update);

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/*
		 * Ensure we either appear before our parent (if already
		 * enqueued) or force our parent to appear after us when it is
		 * enqueued. The fact that we always enqueue bottom-up
		 * reduces this to two cases.
		 */
		if (cfs_rq->tg->parent &&
		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		} else {
			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				&rq_of(cfs_rq)->leaf_cfs_rq_list);
		}

		cfs_rq->on_list = 1;
		/* We should have no load, but we need to update last_decay. */
		update_cfs_rq_blocked_load(cfs_rq, 0);
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}

/* Iterate through all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	if (se->cfs_rq == pse->cfs_rq)
		return 1;

	return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
	int depth = 0;

	for_each_sched_entity(se)
		depth++;

	return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * The preemption test can be made between sibling entities that are
	 * in the same cfs_rq, i.e. that have a common parent. Walk up the
	 * hierarchy of both tasks until we find their ancestors that are
	 * siblings of a common parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}

#else	/* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
	return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
	return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)	1

#define for_each_sched_entity(se) \
		for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
	return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
	struct task_struct *p = task_of(se);
	struct rq *rq = task_rq(p);

	return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
	return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
	return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
	return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif	/* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta > 0)
		min_vruntime = vruntime;

	return min_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
	s64 delta = (s64)(vruntime - min_vruntime);
	if (delta < 0)
		min_vruntime = vruntime;

	return min_vruntime;
}
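
/*
 * Illustrative note: the (s64) cast of the difference makes these
 * comparisons safe across u64 wrap-around. E.g. if vruntime has just
 * wrapped to 5 while min_vruntime is still near ULLONG_MAX, the signed
 * delta is a small positive number, so max_vruntime() correctly treats
 * the wrapped value as the later one.
 */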

static inline int entity_before(struct sched_entity *a,
				struct sched_entity *b)
{
	return (s64)(a->vruntime - b->vruntime) < 0;
}

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
	u64 vruntime = cfs_rq->min_vruntime;

	if (cfs_rq->curr)
		vruntime = cfs_rq->curr->vruntime;

	if (cfs_rq->rb_leftmost) {
		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
						   struct sched_entity,
						   run_node);

		if (!cfs_rq->curr)
			vruntime = se->vruntime;
		else
			vruntime = min_vruntime(vruntime, se->vruntime);
	}

	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
	smp_wmb();
	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
	struct rb_node *parent = NULL;
	struct sched_entity *entry;
	int leftmost = 1;

	/*
	 * Find the right place in the rbtree:
	 */
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		/*
		 * We don't care about collisions. Nodes with
		 * the same key stay together.
		 */
		if (entity_before(se, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			leftmost = 0;
		}
	}

	/*
	 * Maintain a cache of leftmost tree entries (it is frequently
	 * used):
	 */
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	if (cfs_rq->rb_leftmost == &se->run_node) {
		struct rb_node *next_node;

		next_node = rb_next(&se->run_node);
		cfs_rq->rb_leftmost = next_node;
	}

	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *left = cfs_rq->rb_leftmost;

	if (!left)
		return NULL;

	return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
	struct rb_node *next = rb_next(&se->run_node);

	if (!next)
		return NULL;

	return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

	if (!last)
		return NULL;

	return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp,
		loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	int factor = get_update_sysctl_factor();

	if (ret || !write)
		return ret;

	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
					sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
	(normalized_sysctl_##name = sysctl_##name / (factor))
	WRT_SYSCTL(sched_min_granularity);
	WRT_SYSCTL(sched_latency);
	WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

	return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
	if (unlikely(se->load.weight != NICE_0_LOAD))
		delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

	return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
	u64 period = sysctl_sched_latency;
	unsigned long nr_latency = sched_nr_latency;

	if (unlikely(nr_running > nr_latency)) {
		period = sysctl_sched_min_granularity;
		period *= nr_running;
	}

	return period;
}
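
/*
 * Illustrative example (using the unscaled 6ms / 0.75ms defaults and
 * sched_nr_latency = 8): with 5 runnable tasks the period stays at 6ms,
 * so each nice-0 task gets ~1.2ms per period; with 16 runnable tasks the
 * period is stretched to 16 * 0.75ms = 12ms, so no slice drops below the
 * minimum granularity.
 */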

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

	for_each_sched_entity(se) {
		struct load_weight *load;
		struct load_weight lw;

		cfs_rq = cfs_rq_of(se);
		load = &cfs_rq->load;

		if (unlikely(!se->on_rq)) {
			lw = cfs_rq->load;

			update_load_add(&lw, se->load.weight);
			load = &lw;
		}
		slice = calc_delta_mine(slice, se->load.weight, load);
	}
	return slice;
}
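
/*
 * Illustrative example: with two runnable nice-0 tasks (weight 1024
 * each) and an effective period of 6ms, each task's wall-time slice is
 * 6ms * 1024/2048 = 3ms; a heavier (lower nice) task gets a
 * proportionally larger share of the same period.
 */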

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	schedstat_set(curr->statistics.exec_max,
		      max((u64)delta_exec, curr->statistics.exec_max));

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);
	delta_exec_weighted = calc_delta_fair(delta_exec, curr);

	curr->vruntime += delta_exec_weighted;
	update_min_vruntime(cfs_rq);
}

static void update_curr(struct cfs_rq *cfs_rq)
{
	struct sched_entity *curr = cfs_rq->curr;
	u64 now = rq_of(cfs_rq)->clock_task;
	unsigned long delta_exec;

	if (unlikely(!curr))
		return;

	/*
	 * Get the amount of time the current task was running
	 * since the last time we changed load (this cannot
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
	if (!delta_exec)
		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

	if (entity_is_task(curr)) {
		struct task_struct *curtask = task_of(curr);

		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cpuacct_charge(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Are we enqueueing a waiting task? (for current tasks
	 * a dequeue/enqueue event is a NOP)
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
			rq_of(cfs_rq)->clock - se->statistics.wait_start));
	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
	if (entity_is_task(se)) {
		trace_sched_stat_wait(task_of(se),
			rq_of(cfs_rq)->clock - se->statistics.wait_start);
	}
#endif
	schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * Mark the end of the wait period if dequeueing a
	 * waiting task:
	 */
	if (se != cfs_rq->curr)
		update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	/*
	 * We are starting a new run period:
	 */
	se->exec_start = rq_of(cfs_rq)->clock_task;
}

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * numa task sample period in ms
 */
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

static void task_numa_placement(struct task_struct *p)
{
	int seq;

	if (!p->mm)	/* for example, ksmd faulting in a user's mm */
		return;
	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
	if (p->numa_scan_seq == seq)
		return;
	p->numa_scan_seq = seq;

	/* FIXME: Scheduling placement policy hints go here */
}

/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int node, int pages, bool migrated)
{
	struct task_struct *p = current;

	if (!sched_feat_numa(NUMA))
		return;

	/* FIXME: Allocate task-specific structure for placement policy here */

	/*
	 * If pages are properly placed (did not migrate) then scan slower.
	 * This is reset periodically in case of phase changes
	 */
	if (!migrated)
		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
			p->numa_scan_period + jiffies_to_msecs(10));

	task_numa_placement(p);
}
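
/*
 * Illustrative: with the defaults above, numa_scan_period starts at
 * scan_period_min (100ms). Each fault batch that needed no migration
 * backs the period off by jiffies_to_msecs(10) (10ms at HZ=1000), so a
 * well-placed task drifts towards the 5000ms scan_period_max and is
 * scanned far less often until the periodic reset kicks in.
 */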

static void reset_ptenuma_scan(struct task_struct *p)
{
	ACCESS_ONCE(p->mm->numa_scan_seq)++;
	p->mm->numa_scan_offset = 0;
}

/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;
	struct vm_area_struct *vma;
	unsigned long start, end;
	long pages;

	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	work->next = work; /* protect against double add */
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * We do not care about task placement until a task runs on a node
	 * other than the first one used by the address space. This is
	 * largely because migrations are driven by what CPU the task
	 * is running on. If it's never scheduled on another node, it'll
	 * not migrate so why bother trapping the fault.
	 */
	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
		mm->first_nid = numa_node_id();
	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
		/* Are we running on a new node yet? */
		if (numa_node_id() == mm->first_nid &&
		    !sched_feat_numa(NUMA_FORCE))
			return;

		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
	}

	/*
	 * Reset the scan period if enough time has gone by. Objective is that
	 * scanning will be reduced if pages are properly placed. As tasks
	 * can enter different phases this needs to be re-examined. Lacking
	 * proper tracking of reference behaviour, this blunt hammer is used.
	 */
	migrate = mm->numa_next_reset;
	if (time_after(now, migrate)) {
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
		xchg(&mm->numa_next_reset, next_scan);
	}

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	if (p->numa_scan_period == 0)
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/*
	 * Do not set pte_numa if the current running node is rate-limited.
	 * This loses statistics on the fault but if we are unwilling to
	 * migrate to this node, it is less likely we can do useful work
	 */
	if (migrate_ratelimited(numa_node_id()))
		return;

	start = mm->numa_scan_offset;
	pages = sysctl_numa_balancing_scan_size;
	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
	if (!pages)
		return;

	down_read(&mm->mmap_sem);
	vma = find_vma(mm, start);
	if (!vma) {
		reset_ptenuma_scan(p);
		start = 0;
		vma = mm->mmap;
	}
	for (; vma; vma = vma->vm_next) {
		if (!vma_migratable(vma))
			continue;

		/* Skip small VMAs. They are not likely to be of relevance */
		if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
			continue;

		do {
			start = max(start, vma->vm_start);
			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
			end = min(end, vma->vm_end);
			pages -= change_prot_numa(vma, start, end);

			start = end;
			if (pages <= 0)
				goto out;
		} while (end != vma->vm_end);
	}

out:
	/*
	 * It is possible to reach the end of the VMA list but the last few
	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
	 * would find the !migratable VMA on the next scan but not reset the
	 * scanner to the start so check it now.
	 */
	if (vma)
		mm->numa_scan_offset = start;
	else
		reset_ptenuma_scan(p);
	up_read(&mm->mmap_sem);
}
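
/*
 * Illustrative: with the default 256MB scan size and 4KiB pages,
 * "pages" starts at 256 << (20 - 12) = 65536 PTEs per pass. Each pass
 * resumes from numa_scan_offset, marks up to that many PTEs prot_numa
 * via change_prot_numa(), and records where it stopped so the next
 * pass continues from there.
 */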

/*
 * Drive the periodic memory faults..
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now - curr->node_stamp > period) {
		if (!curr->node_stamp)
			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		curr->node_stamp = now;

		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
			task_work_add(curr, work, true);
		}
	}
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_add(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
	if (entity_is_task(se))
		list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
	cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	update_load_sub(&cfs_rq->load, se->load.weight);
	if (!parent_entity(se))
		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
	if (entity_is_task(se))
		list_del_init(&se->group_node);
	cfs_rq->nr_running--;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
	long tg_weight;

	/*
	 * Use this CPU's actual weight instead of the last load_contribution
	 * to gain a more accurate current total weight. See
	 * update_cfs_rq_load_contribution().
	 */
	tg_weight = atomic64_read(&tg->load_avg);
	tg_weight -= cfs_rq->tg_load_contrib;
	tg_weight += cfs_rq->load.weight;

	return tg_weight;
}

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	long tg_weight, load, shares;

	tg_weight = calc_tg_weight(tg, cfs_rq);
	load = cfs_rq->load.weight;

	shares = (tg->shares * load);
	if (tg_weight)
		shares /= tg_weight;

	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > tg->shares)
		shares = tg->shares;

	return shares;
}
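
/*
 * Worked example (illustrative): if a group has tg->shares = 1024 and
 * its queue on this CPU carries 512 of a total tg_weight of 2048, the
 * group entity's weight here becomes 1024 * 512 / 2048 = 256, clamped
 * to the [MIN_SHARES, tg->shares] range.
 */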
# else /* CONFIG_SMP */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
	return tg->shares;
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
			    unsigned long weight)
{
	if (se->on_rq) {
		/* commit outstanding execution time */
		if (cfs_rq->curr == se)
			update_curr(cfs_rq);
		account_entity_dequeue(cfs_rq, se);
	}

	update_load_set(&se->load, weight);

	if (se->on_rq)
		account_entity_enqueue(cfs_rq, se);
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
	struct task_group *tg;
	struct sched_entity *se;
	long shares;

	tg = cfs_rq->tg;
	se = tg->se[cpu_of(rq_of(cfs_rq))];
	if (!se || throttled_hierarchy(cfs_rq))
		return;
#ifndef CONFIG_SMP
	if (likely(se->load.weight == tg->shares))
		return;
#endif
	shares = calc_cfs_shares(cfs_rq, tg);

	reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables below are dependent on this value.
 */
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */

/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
	0x85aac367, 0x82cd8698,
};

/*
 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
 * over-estimates when re-combining.
 */
static const u32 runnable_avg_yN_sum[] = {
	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};

/*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 */
static __always_inline u64 decay_load(u64 val, u64 n)
{
	unsigned int local_n;

	if (!n)
		return val;
	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
		return 0;

	/* after bounds checking we can collapse to 32-bit */
	local_n = n;

	/*
	 * As y^PERIOD = 1/2, we can combine
	 *   y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
	 * with a look-up table which covers y^n (n<PERIOD)
	 *
	 * to achieve constant time decay_load.
	 */
	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
		val >>= local_n / LOAD_AVG_PERIOD;
		local_n %= LOAD_AVG_PERIOD;
	}

	val *= runnable_avg_yN_inv[local_n];
	/* We don't use SRR here since we always want to round down. */
	return val >> 32;
}
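
/*
 * Illustrative: decay_load(LOAD_AVG_MAX, 32) is ~LOAD_AVG_MAX/2 since
 * y^32 = 1/2, computed as one right shift plus one table multiply;
 * decay_load(val, 1) multiplies by runnable_avg_yN_inv[1], i.e. ~0.979
 * in 0.32 fixed point.
 */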

/*
 * For updates fully spanning n periods, the contribution to runnable
 * average will be: \Sum 1024*y^n
 *
 * We can compute this reasonably efficiently by combining:
 *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
 */
static u32 __compute_runnable_contrib(u64 n)
{
	u32 contrib = 0;

	if (likely(n <= LOAD_AVG_PERIOD))
		return runnable_avg_yN_sum[n];
	else if (unlikely(n >= LOAD_AVG_MAX_N))
		return LOAD_AVG_MAX;

	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
	do {
		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

		n -= LOAD_AVG_PERIOD;
	} while (n > LOAD_AVG_PERIOD);

	contrib = decay_load(contrib, n);
	return contrib + runnable_avg_yN_sum[n];
}
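
/*
 * Illustrative: __compute_runnable_contrib(1) = 1002 (~1024*y), a full
 * 32-period span contributes 23371, and as n grows the sum converges to
 * LOAD_AVG_MAX (47742), the value an always-runnable entity saturates at.
 */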

/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series.  To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *       p0            p1           p2
 *      (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our coefficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 */
static __always_inline int __update_entity_runnable_avg(u64 now,
							struct sched_avg *sa,
							int runnable)
{
	u64 delta, periods;
	u32 runnable_contrib;
	int delta_w, decayed = 0;

	delta = now - sa->last_runnable_update;
	/*
	 * This should only happen when time goes backwards, which it
	 * unfortunately does during sched clock init when we swap over to TSC.
	 */
	if ((s64)delta < 0) {
		sa->last_runnable_update = now;
		return 0;
	}

	/*
	 * Use 1024ns as the unit of measurement since it's a reasonable
	 * approximation of 1us and fast to compute.
	 */
	delta >>= 10;
	if (!delta)
		return 0;
	sa->last_runnable_update = now;

	/* delta_w is the amount already accumulated against our next period */
	delta_w = sa->runnable_avg_period % 1024;
	if (delta + delta_w >= 1024) {
		/* period roll-over */
		decayed = 1;

		/*
		 * Now that we know we're crossing a period boundary, figure
		 * out how much from delta we need to complete the current
		 * period and accrue it.
		 */
		delta_w = 1024 - delta_w;
		if (runnable)
			sa->runnable_avg_sum += delta_w;
		sa->runnable_avg_period += delta_w;

		delta -= delta_w;

		/* Figure out how many additional periods this update spans */
		periods = delta / 1024;
		delta %= 1024;

		sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
						  periods + 1);
		sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
						     periods + 1);

		/* Efficiently calculate \Sum (1..n_period) 1024*y^i */
		runnable_contrib = __compute_runnable_contrib(periods);
		if (runnable)
			sa->runnable_avg_sum += runnable_contrib;
		sa->runnable_avg_period += runnable_contrib;
	}

	/* Remainder of delta accrued against u_0` */
	if (runnable)
		sa->runnable_avg_sum += delta;
	sa->runnable_avg_period += delta;

	return decayed;
}
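
/*
 * Worked example (illustrative): suppose 800us of the current 1024us
 * period were already accumulated and a runnable entity is updated
 * 1500us later.  The first 224us complete the current period, the old
 * sums are decayed by y^2, one full period is accounted via
 * __compute_runnable_contrib(1), and the trailing 252us start the new
 * u_0` period without being decayed yet.
 */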
1299
Paul Turner9ee474f2012-10-04 13:18:30 +02001300/* Synchronize an entity's decay with its parenting cfs_rq.*/
Paul Turneraff3e492012-10-04 13:18:30 +02001301static inline u64 __synchronize_entity_decay(struct sched_entity *se)
Paul Turner9ee474f2012-10-04 13:18:30 +02001302{
1303 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1304 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1305
1306 decays -= se->avg.decay_count;
1307 if (!decays)
Paul Turneraff3e492012-10-04 13:18:30 +02001308 return 0;
Paul Turner9ee474f2012-10-04 13:18:30 +02001309
1310 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1311 se->avg.decay_count = 0;
Paul Turneraff3e492012-10-04 13:18:30 +02001312
1313 return decays;
Paul Turner9ee474f2012-10-04 13:18:30 +02001314}
1315
Paul Turnerc566e8e2012-10-04 13:18:30 +02001316#ifdef CONFIG_FAIR_GROUP_SCHED
1317static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1318 int force_update)
1319{
1320 struct task_group *tg = cfs_rq->tg;
1321 s64 tg_contrib;
1322
1323 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1324 tg_contrib -= cfs_rq->tg_load_contrib;
1325
1326 if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1327 atomic64_add(tg_contrib, &tg->load_avg);
1328 cfs_rq->tg_load_contrib += tg_contrib;
1329 }
1330}
Paul Turner8165e142012-10-04 13:18:31 +02001331
Paul Turnerbb17f652012-10-04 13:18:31 +02001332/*
1333 * Aggregate cfs_rq runnable averages into an equivalent task_group
1334 * representation for computing load contributions.
1335 */
1336static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1337 struct cfs_rq *cfs_rq)
1338{
1339 struct task_group *tg = cfs_rq->tg;
1340 long contrib;
1341
1342 /* The fraction of a cpu used by this cfs_rq */
1343 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1344 sa->runnable_avg_period + 1);
1345 contrib -= cfs_rq->tg_runnable_contrib;
1346
1347 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
1348 atomic_add(contrib, &tg->runnable_avg);
1349 cfs_rq->tg_runnable_contrib += contrib;
1350 }
1351}
1352
Paul Turner8165e142012-10-04 13:18:31 +02001353static inline void __update_group_entity_contrib(struct sched_entity *se)
1354{
1355 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1356 struct task_group *tg = cfs_rq->tg;
Paul Turnerbb17f652012-10-04 13:18:31 +02001357 int runnable_avg;
1358
Paul Turner8165e142012-10-04 13:18:31 +02001359 u64 contrib;
1360
1361 contrib = cfs_rq->tg_load_contrib * tg->shares;
1362 se->avg.load_avg_contrib = div64_u64(contrib,
1363 atomic64_read(&tg->load_avg) + 1);
Paul Turnerbb17f652012-10-04 13:18:31 +02001364
1365 /*
1366 * For group entities we need to compute a correction term in the case
1367 * that they are consuming <1 cpu so that we would contribute the same
1368 * load as a task of equal weight.
1369 *
1370 * Explicitly co-ordinating this measurement would be expensive, but
1371 * fortunately the sum of each cpus contribution forms a usable
1372 * lower-bound on the true value.
1373 *
1374 * Consider the aggregate of 2 contributions. Either they are disjoint
1375 * (and the sum represents true value) or they are disjoint and we are
1376 * understating by the aggregate of their overlap.
1377 *
1378 * Extending this to N cpus, for a given overlap, the maximum amount we
1379 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
1380 * cpus that overlap for this interval and w_i is the interval width.
1381 *
1382 * On a small machine; the first term is well-bounded which bounds the
1383 * total error since w_i is a subset of the period. Whereas on a
1384 * larger machine, while this first term can be larger, if w_i is the
1385 * of consequential size guaranteed to see n_i*w_i quickly converge to
1386 * our upper bound of 1-cpu.
1387 */
1388 runnable_avg = atomic_read(&tg->runnable_avg);
1389 if (runnable_avg < NICE_0_LOAD) {
1390 se->avg.load_avg_contrib *= runnable_avg;
1391 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
1392 }
Paul Turner8165e142012-10-04 13:18:31 +02001393}
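/*
 * Illustrative numbers for the above (values assumed, not measured): with
 * tg->shares == 1024 and this cpu holding half of the group's tracked
 * load, the division gives se->avg.load_avg_contrib ~= 512.  If the group
 * as a whole is runnable only half the time (runnable_avg ~= NICE_0_LOAD/2)
 * the correction term then scales this down to ~256, so a mostly idle
 * group does not advertise its full share weight as load.
 */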
Paul Turnerc566e8e2012-10-04 13:18:30 +02001394#else
1395static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1396 int force_update) {}
Paul Turnerbb17f652012-10-04 13:18:31 +02001397static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1398 struct cfs_rq *cfs_rq) {}
Paul Turner8165e142012-10-04 13:18:31 +02001399static inline void __update_group_entity_contrib(struct sched_entity *se) {}
Paul Turnerc566e8e2012-10-04 13:18:30 +02001400#endif
1401
Paul Turner8165e142012-10-04 13:18:31 +02001402static inline void __update_task_entity_contrib(struct sched_entity *se)
1403{
1404 u32 contrib;
1405
1406 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
1407 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
1408 contrib /= (se->avg.runnable_avg_period + 1);
1409 se->avg.load_avg_contrib = scale_load(contrib);
1410}
1411
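/*
 * For a task the contribution is just its weight scaled by the decayed
 * fraction of recent time it was runnable: e.g. a nice-0 task
 * (load.weight == 1024, assuming no extra load resolution) that was
 * runnable for roughly a quarter of its tracked history ends up with
 * load_avg_contrib ~= 256.
 */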
Paul Turner2dac7542012-10-04 13:18:30 +02001412/* Compute the current contribution to load_avg by se, return any delta */
1413static long __update_entity_load_avg_contrib(struct sched_entity *se)
1414{
1415 long old_contrib = se->avg.load_avg_contrib;
1416
Paul Turner8165e142012-10-04 13:18:31 +02001417 if (entity_is_task(se)) {
1418 __update_task_entity_contrib(se);
1419 } else {
Paul Turnerbb17f652012-10-04 13:18:31 +02001420 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
Paul Turner8165e142012-10-04 13:18:31 +02001421 __update_group_entity_contrib(se);
1422 }
Paul Turner2dac7542012-10-04 13:18:30 +02001423
1424 return se->avg.load_avg_contrib - old_contrib;
1425}
1426
Paul Turner9ee474f2012-10-04 13:18:30 +02001427static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
1428 long load_contrib)
1429{
1430 if (likely(load_contrib < cfs_rq->blocked_load_avg))
1431 cfs_rq->blocked_load_avg -= load_contrib;
1432 else
1433 cfs_rq->blocked_load_avg = 0;
1434}
1435
Paul Turnerf1b17282012-10-04 13:18:31 +02001436static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
1437
Paul Turner9d85f212012-10-04 13:18:29 +02001438/* Update a sched_entity's runnable average */
Paul Turner9ee474f2012-10-04 13:18:30 +02001439static inline void update_entity_load_avg(struct sched_entity *se,
1440 int update_cfs_rq)
Paul Turner9d85f212012-10-04 13:18:29 +02001441{
Paul Turner2dac7542012-10-04 13:18:30 +02001442 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1443 long contrib_delta;
Paul Turnerf1b17282012-10-04 13:18:31 +02001444 u64 now;
Paul Turner2dac7542012-10-04 13:18:30 +02001445
Paul Turnerf1b17282012-10-04 13:18:31 +02001446 /*
 1447 * For a group entity we need to use the cfs_rq_clock_task() it owns, in
 1448 * case it is the parent of a throttled hierarchy.
1449 */
1450 if (entity_is_task(se))
1451 now = cfs_rq_clock_task(cfs_rq);
1452 else
1453 now = cfs_rq_clock_task(group_cfs_rq(se));
1454
1455 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
Paul Turner2dac7542012-10-04 13:18:30 +02001456 return;
1457
1458 contrib_delta = __update_entity_load_avg_contrib(se);
Paul Turner9ee474f2012-10-04 13:18:30 +02001459
1460 if (!update_cfs_rq)
1461 return;
1462
Paul Turner2dac7542012-10-04 13:18:30 +02001463 if (se->on_rq)
1464 cfs_rq->runnable_load_avg += contrib_delta;
Paul Turner9ee474f2012-10-04 13:18:30 +02001465 else
1466 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
1467}
1468
1469/*
1470 * Decay the load contributed by all blocked children and account this so that
 1471 * their contribution may be appropriately discounted when they wake up.
1472 */
Paul Turneraff3e492012-10-04 13:18:30 +02001473static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
Paul Turner9ee474f2012-10-04 13:18:30 +02001474{
Paul Turnerf1b17282012-10-04 13:18:31 +02001475 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
Paul Turner9ee474f2012-10-04 13:18:30 +02001476 u64 decays;
1477
1478 decays = now - cfs_rq->last_decay;
Paul Turneraff3e492012-10-04 13:18:30 +02001479 if (!decays && !force_update)
Paul Turner9ee474f2012-10-04 13:18:30 +02001480 return;
1481
Paul Turneraff3e492012-10-04 13:18:30 +02001482 if (atomic64_read(&cfs_rq->removed_load)) {
1483 u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
1484 subtract_blocked_load_contrib(cfs_rq, removed_load);
1485 }
Paul Turner9ee474f2012-10-04 13:18:30 +02001486
Paul Turneraff3e492012-10-04 13:18:30 +02001487 if (decays) {
1488 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
1489 decays);
1490 atomic64_add(decays, &cfs_rq->decay_counter);
1491 cfs_rq->last_decay = now;
1492 }
Paul Turnerc566e8e2012-10-04 13:18:30 +02001493
1494 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
Paul Turner9d85f212012-10-04 13:18:29 +02001495}
Ben Segall18bf2802012-10-04 12:51:20 +02001496
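/*
 * Example of the flow above (timings approximate): if two ~1ms decay
 * periods elapse while formerly-running tasks stay blocked, their
 * aggregate in blocked_load_avg is multiplied by y^2; contributions of
 * tasks that instead migrated away were batched into removed_load by the
 * migration path and are subtracted first, so they are not decayed here
 * as if they were still ours.
 */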
1497static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
1498{
1499 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
Paul Turnerbb17f652012-10-04 13:18:31 +02001500 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
Ben Segall18bf2802012-10-04 12:51:20 +02001501}
Paul Turner2dac7542012-10-04 13:18:30 +02001502
1503/* Add the load generated by se into cfs_rq's child load-average */
1504static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner9ee474f2012-10-04 13:18:30 +02001505 struct sched_entity *se,
1506 int wakeup)
Paul Turner2dac7542012-10-04 13:18:30 +02001507{
Paul Turneraff3e492012-10-04 13:18:30 +02001508 /*
 1509 * We track migrations using entity decay_count <= 0; on a wake-up
1510 * migration we use a negative decay count to track the remote decays
1511 * accumulated while sleeping.
1512 */
1513 if (unlikely(se->avg.decay_count <= 0)) {
Paul Turner9ee474f2012-10-04 13:18:30 +02001514 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
Paul Turneraff3e492012-10-04 13:18:30 +02001515 if (se->avg.decay_count) {
1516 /*
1517 * In a wake-up migration we have to approximate the
1518 * time sleeping. This is because we can't synchronize
1519 * clock_task between the two cpus, and it is not
1520 * guaranteed to be read-safe. Instead, we can
1521 * approximate this using our carried decays, which are
1522 * explicitly atomically readable.
1523 */
1524 se->avg.last_runnable_update -= (-se->avg.decay_count)
1525 << 20;
1526 update_entity_load_avg(se, 0);
1527 /* Indicate that we're now synchronized and on-rq */
1528 se->avg.decay_count = 0;
1529 }
Paul Turner9ee474f2012-10-04 13:18:30 +02001530 wakeup = 0;
1531 } else {
1532 __synchronize_entity_decay(se);
1533 }
1534
Paul Turneraff3e492012-10-04 13:18:30 +02001535 /* migrated tasks did not contribute to our blocked load */
1536 if (wakeup) {
Paul Turner9ee474f2012-10-04 13:18:30 +02001537 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turneraff3e492012-10-04 13:18:30 +02001538 update_entity_load_avg(se, 0);
1539 }
Paul Turner9ee474f2012-10-04 13:18:30 +02001540
Paul Turner2dac7542012-10-04 13:18:30 +02001541 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
Paul Turneraff3e492012-10-04 13:18:30 +02001542 /* we force update consideration on load-balancer moves */
1543 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
Paul Turner2dac7542012-10-04 13:18:30 +02001544}
1545
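/*
 * A concrete reading of the wake-up migration case above: a task arriving
 * here with, say, decay_count == -3 is treated as if roughly three decay
 * periods (3 << 20 ns) passed while it slept on the other cpu; pulling
 * last_runnable_update back by that amount lets update_entity_load_avg()
 * apply the matching decay without ever comparing the two cpus'
 * clock_task values directly.
 */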
Paul Turner9ee474f2012-10-04 13:18:30 +02001546/*
 1547 * Remove se's load from this cfs_rq child load-average; if the entity is
 1548 * transitioning to a blocked state, we track its projected decay using
1549 * blocked_load_avg.
1550 */
Paul Turner2dac7542012-10-04 13:18:30 +02001551static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner9ee474f2012-10-04 13:18:30 +02001552 struct sched_entity *se,
1553 int sleep)
Paul Turner2dac7542012-10-04 13:18:30 +02001554{
Paul Turner9ee474f2012-10-04 13:18:30 +02001555 update_entity_load_avg(se, 1);
Paul Turneraff3e492012-10-04 13:18:30 +02001556 /* we force update consideration on load-balancer moves */
1557 update_cfs_rq_blocked_load(cfs_rq, !sleep);
Paul Turner9ee474f2012-10-04 13:18:30 +02001558
Paul Turner2dac7542012-10-04 13:18:30 +02001559 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
Paul Turner9ee474f2012-10-04 13:18:30 +02001560 if (sleep) {
1561 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
1562 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
1563 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
Paul Turner2dac7542012-10-04 13:18:30 +02001564}
Paul Turner9d85f212012-10-04 13:18:29 +02001565#else
Paul Turner9ee474f2012-10-04 13:18:30 +02001566static inline void update_entity_load_avg(struct sched_entity *se,
1567 int update_cfs_rq) {}
Ben Segall18bf2802012-10-04 12:51:20 +02001568static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
Paul Turner2dac7542012-10-04 13:18:30 +02001569static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner9ee474f2012-10-04 13:18:30 +02001570 struct sched_entity *se,
1571 int wakeup) {}
Paul Turner2dac7542012-10-04 13:18:30 +02001572static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner9ee474f2012-10-04 13:18:30 +02001573 struct sched_entity *se,
1574 int sleep) {}
Paul Turneraff3e492012-10-04 13:18:30 +02001575static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
1576 int force_update) {}
Paul Turner9d85f212012-10-04 13:18:29 +02001577#endif
1578
Ingo Molnar2396af62007-08-09 11:16:48 +02001579static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001580{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001581#ifdef CONFIG_SCHEDSTATS
Peter Zijlstrae4143142009-07-23 20:13:26 +02001582 struct task_struct *tsk = NULL;
1583
1584 if (entity_is_task(se))
1585 tsk = task_of(se);
1586
Lucas De Marchi41acab82010-03-10 23:37:45 -03001587 if (se->statistics.sleep_start) {
1588 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001589
1590 if ((s64)delta < 0)
1591 delta = 0;
1592
Lucas De Marchi41acab82010-03-10 23:37:45 -03001593 if (unlikely(delta > se->statistics.sleep_max))
1594 se->statistics.sleep_max = delta;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001595
Peter Zijlstra8c79a042012-01-30 14:51:37 +01001596 se->statistics.sleep_start = 0;
Lucas De Marchi41acab82010-03-10 23:37:45 -03001597 se->statistics.sum_sleep_runtime += delta;
Arjan van de Ven97455122008-01-25 21:08:34 +01001598
Peter Zijlstra768d0c22009-07-23 20:13:26 +02001599 if (tsk) {
Peter Zijlstrae4143142009-07-23 20:13:26 +02001600 account_scheduler_latency(tsk, delta >> 10, 1);
Peter Zijlstra768d0c22009-07-23 20:13:26 +02001601 trace_sched_stat_sleep(tsk, delta);
1602 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001603 }
Lucas De Marchi41acab82010-03-10 23:37:45 -03001604 if (se->statistics.block_start) {
1605 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001606
1607 if ((s64)delta < 0)
1608 delta = 0;
1609
Lucas De Marchi41acab82010-03-10 23:37:45 -03001610 if (unlikely(delta > se->statistics.block_max))
1611 se->statistics.block_max = delta;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001612
Peter Zijlstra8c79a042012-01-30 14:51:37 +01001613 se->statistics.block_start = 0;
Lucas De Marchi41acab82010-03-10 23:37:45 -03001614 se->statistics.sum_sleep_runtime += delta;
Ingo Molnar30084fb2007-10-02 14:13:08 +02001615
Peter Zijlstrae4143142009-07-23 20:13:26 +02001616 if (tsk) {
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07001617 if (tsk->in_iowait) {
Lucas De Marchi41acab82010-03-10 23:37:45 -03001618 se->statistics.iowait_sum += delta;
1619 se->statistics.iowait_count++;
Peter Zijlstra768d0c22009-07-23 20:13:26 +02001620 trace_sched_stat_iowait(tsk, delta);
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07001621 }
1622
Andrew Vaginb781a602011-11-28 12:03:35 +03001623 trace_sched_stat_blocked(tsk, delta);
1624
Peter Zijlstrae4143142009-07-23 20:13:26 +02001625 /*
1626 * Blocking time is in units of nanosecs, so shift by
1627 * 20 to get a milliseconds-range estimation of the
1628 * amount of time that the task spent sleeping:
1629 */
1630 if (unlikely(prof_on == SLEEP_PROFILING)) {
1631 profile_hits(SLEEP_PROFILING,
1632 (void *)get_wchan(tsk),
1633 delta >> 20);
1634 }
1635 account_scheduler_latency(tsk, delta >> 10, 0);
Ingo Molnar30084fb2007-10-02 14:13:08 +02001636 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001637 }
1638#endif
1639}
1640
Peter Zijlstraddc97292007-10-15 17:00:10 +02001641static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
1642{
1643#ifdef CONFIG_SCHED_DEBUG
1644 s64 d = se->vruntime - cfs_rq->min_vruntime;
1645
1646 if (d < 0)
1647 d = -d;
1648
1649 if (d > 3*sysctl_sched_latency)
1650 schedstat_inc(cfs_rq, nr_spread_over);
1651#endif
1652}
1653
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001654static void
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001655place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
1656{
Peter Zijlstra1af5f732008-10-24 11:06:13 +02001657 u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02001658
Peter Zijlstra2cb86002007-11-09 22:39:37 +01001659 /*
 1660 * The 'current' period is already promised to the current tasks;
 1661 * however, the extra weight of the new task will slow them down a
 1662 * little, so place the new task so that it fits in the slot that
1663 * stays open at the end.
1664 */
Peter Zijlstra94dfb5e2007-10-15 17:00:05 +02001665 if (initial && sched_feat(START_DEBIT))
Peter Zijlstraf9c0b092008-10-17 19:27:04 +02001666 vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001667
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001668 /* sleeps up to a single latency don't count. */
Mike Galbraith5ca98802010-03-11 17:17:17 +01001669 if (!initial) {
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001670 unsigned long thresh = sysctl_sched_latency;
Peter Zijlstraa7be37a2008-06-27 13:41:11 +02001671
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001672 /*
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001673 * Halve their sleep time's effect, to allow
1674 * for a gentler effect of sleepers:
1675 */
1676 if (sched_feat(GENTLE_FAIR_SLEEPERS))
1677 thresh >>= 1;
Ingo Molnar51e03042009-09-16 08:54:45 +02001678
Mike Galbraitha2e7a7e2009-09-18 09:19:25 +02001679 vruntime -= thresh;
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001680 }
1681
Mike Galbraithb5d9d732009-09-08 11:12:28 +02001682 /* ensure we never gain time by being placed backwards. */
Viresh Kumar16c8f1c2012-11-08 13:33:46 +05301683 se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001684}
1685
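/*
 * Net effect of the placement above, as a sketch: a newly forked entity
 * (initial, with START_DEBIT) starts one vslice after min_vruntime, i.e.
 * at the back of the current round, while a waking sleeper is placed up
 * to half of sysctl_sched_latency before min_vruntime (the full latency
 * without GENTLE_FAIR_SLEEPERS), but never behind the vruntime it already
 * had, so sleeping cannot be exploited to build up credit.
 */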
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001686static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
1687
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001688static void
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001689enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001690{
1691 /*
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001692 * Update the normalized vruntime before updating min_vruntime
 1693 * through calling update_curr().
1694 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001695 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001696 se->vruntime += cfs_rq->min_vruntime;
1697
1698 /*
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02001699 * Update run-time statistics of the 'current'.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001700 */
Ingo Molnarb7cc0892007-08-09 11:16:47 +02001701 update_curr(cfs_rq);
Paul Turnerf269ae02012-10-04 13:18:31 +02001702 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08001703 account_entity_enqueue(cfs_rq, se);
1704 update_cfs_shares(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001705
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001706 if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02001707 place_entity(cfs_rq, se, 0);
Ingo Molnar2396af62007-08-09 11:16:48 +02001708 enqueue_sleeper(cfs_rq, se);
Ingo Molnare9acbff2007-10-15 17:00:04 +02001709 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001710
Ingo Molnard2417e52007-08-09 11:16:47 +02001711 update_stats_enqueue(cfs_rq, se);
Peter Zijlstraddc97292007-10-15 17:00:10 +02001712 check_spread(cfs_rq, se);
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02001713 if (se != cfs_rq->curr)
1714 __enqueue_entity(cfs_rq, se);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08001715 se->on_rq = 1;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08001716
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001717 if (cfs_rq->nr_running == 1) {
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08001718 list_add_leaf_cfs_rq(cfs_rq);
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001719 check_enqueue_throttle(cfs_rq);
1720 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001721}
1722
Rik van Riel2c13c9192011-02-01 09:48:37 -05001723static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra2002c692008-11-11 11:52:33 +01001724{
Rik van Riel2c13c9192011-02-01 09:48:37 -05001725 for_each_sched_entity(se) {
1726 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1727 if (cfs_rq->last == se)
1728 cfs_rq->last = NULL;
1729 else
1730 break;
1731 }
1732}
Peter Zijlstra2002c692008-11-11 11:52:33 +01001733
Rik van Riel2c13c9192011-02-01 09:48:37 -05001734static void __clear_buddies_next(struct sched_entity *se)
1735{
1736 for_each_sched_entity(se) {
1737 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1738 if (cfs_rq->next == se)
1739 cfs_rq->next = NULL;
1740 else
1741 break;
1742 }
Peter Zijlstra2002c692008-11-11 11:52:33 +01001743}
1744
Rik van Rielac53db52011-02-01 09:51:03 -05001745static void __clear_buddies_skip(struct sched_entity *se)
1746{
1747 for_each_sched_entity(se) {
1748 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1749 if (cfs_rq->skip == se)
1750 cfs_rq->skip = NULL;
1751 else
1752 break;
1753 }
1754}
1755
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01001756static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1757{
Rik van Riel2c13c9192011-02-01 09:48:37 -05001758 if (cfs_rq->last == se)
1759 __clear_buddies_last(se);
1760
1761 if (cfs_rq->next == se)
1762 __clear_buddies_next(se);
Rik van Rielac53db52011-02-01 09:51:03 -05001763
1764 if (cfs_rq->skip == se)
1765 __clear_buddies_skip(se);
Peter Zijlstraa571bbe2009-01-28 14:51:40 +01001766}
1767
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07001768static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turnerd8b49862011-07-21 09:43:41 -07001769
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001770static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001771dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001772{
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02001773 /*
1774 * Update run-time statistics of the 'current'.
1775 */
1776 update_curr(cfs_rq);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08001777 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
Dmitry Adamushkoa2a2d682007-10-15 17:00:13 +02001778
Ingo Molnar19b6a2e2007-08-09 11:16:48 +02001779 update_stats_dequeue(cfs_rq, se);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001780 if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02001781#ifdef CONFIG_SCHEDSTATS
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001782 if (entity_is_task(se)) {
1783 struct task_struct *tsk = task_of(se);
1784
1785 if (tsk->state & TASK_INTERRUPTIBLE)
Lucas De Marchi41acab82010-03-10 23:37:45 -03001786 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001787 if (tsk->state & TASK_UNINTERRUPTIBLE)
Lucas De Marchi41acab82010-03-10 23:37:45 -03001788 se->statistics.block_start = rq_of(cfs_rq)->clock;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001789 }
Dmitry Adamushkodb36cc72007-10-15 17:00:06 +02001790#endif
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02001791 }
1792
Peter Zijlstra2002c692008-11-11 11:52:33 +01001793 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01001794
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02001795 if (se != cfs_rq->curr)
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001796 __dequeue_entity(cfs_rq, se);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08001797 se->on_rq = 0;
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001798 account_entity_dequeue(cfs_rq, se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001799
1800 /*
1801 * Normalize the entity after updating the min_vruntime because the
1802 * update can refer to the ->curr item and we need to reflect this
1803 * movement in our normalized position.
1804 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001805 if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01001806 se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra1e876232011-05-17 16:21:10 -07001807
Paul Turnerd8b49862011-07-21 09:43:41 -07001808 /* return excess runtime on last dequeue */
1809 return_cfs_rq_runtime(cfs_rq);
1810
Peter Zijlstra1e876232011-05-17 16:21:10 -07001811 update_min_vruntime(cfs_rq);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08001812 update_cfs_shares(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001813}
1814
1815/*
1816 * Preempt the current task with a newly woken task if needed:
1817 */
Peter Zijlstra7c92e542007-09-05 14:32:49 +02001818static void
Ingo Molnar2e09bf52007-10-15 17:00:05 +02001819check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001820{
Peter Zijlstra11697832007-09-05 14:32:49 +02001821 unsigned long ideal_runtime, delta_exec;
Wang Xingchaof4cfb332011-09-16 13:35:52 -04001822 struct sched_entity *se;
1823 s64 delta;
Peter Zijlstra11697832007-09-05 14:32:49 +02001824
Peter Zijlstra6d0f0ebd2007-10-15 17:00:05 +02001825 ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra11697832007-09-05 14:32:49 +02001826 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01001827 if (delta_exec > ideal_runtime) {
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001828 resched_task(rq_of(cfs_rq)->curr);
Mike Galbraitha9f3e2b2009-01-28 14:51:39 +01001829 /*
1830 * The current task ran long enough, ensure it doesn't get
1831 * re-elected due to buddy favours.
1832 */
1833 clear_buddies(cfs_rq, curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02001834 return;
1835 }
1836
1837 /*
1838 * Ensure that a task that missed wakeup preemption by a
1839 * narrow margin doesn't have to wait for a full slice.
1840 * This also mitigates buddy induced latencies under load.
1841 */
Mike Galbraithf685cea2009-10-23 23:09:22 +02001842 if (delta_exec < sysctl_sched_min_granularity)
1843 return;
1844
Wang Xingchaof4cfb332011-09-16 13:35:52 -04001845 se = __pick_first_entity(cfs_rq);
1846 delta = curr->vruntime - se->vruntime;
Mike Galbraithf685cea2009-10-23 23:09:22 +02001847
Wang Xingchaof4cfb332011-09-16 13:35:52 -04001848 if (delta < 0)
1849 return;
Mike Galbraithd7d82942011-01-05 05:41:17 +01001850
Wang Xingchaof4cfb332011-09-16 13:35:52 -04001851 if (delta > ideal_runtime)
1852 resched_task(rq_of(cfs_rq)->curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001853}
1854
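/*
 * Worked example for the tick-driven preemption above, assuming two
 * runnable nice-0 tasks: sched_slice() hands each roughly half of the
 * latency target, so the running task is marked for reschedule once it
 * has consumed about that much CPU in one go, or earlier if its vruntime
 * has drifted more than one ideal slice ahead of the leftmost entity
 * (though never before it has run for sysctl_sched_min_granularity).
 */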
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02001855static void
Ingo Molnar8494f412007-08-09 11:16:48 +02001856set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001857{
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02001858 /* 'current' is not kept within the tree. */
1859 if (se->on_rq) {
1860 /*
 1861 * Any task has to be enqueued before it gets to execute on
1862 * a CPU. So account for the time it spent waiting on the
1863 * runqueue.
1864 */
1865 update_stats_wait_end(cfs_rq, se);
1866 __dequeue_entity(cfs_rq, se);
1867 }
1868
Ingo Molnar79303e92007-08-09 11:16:47 +02001869 update_stats_curr_start(cfs_rq, se);
Ingo Molnar429d43b2007-10-15 17:00:03 +02001870 cfs_rq->curr = se;
Ingo Molnareba1ed42007-10-15 17:00:02 +02001871#ifdef CONFIG_SCHEDSTATS
1872 /*
1873 * Track our maximum slice length, if the CPU's load is at
 1874 * least twice that of our own weight (i.e. don't track it
1875 * when there are only lesser-weight tasks around):
1876 */
Dmitry Adamushko495eca42007-10-15 17:00:06 +02001877 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi41acab82010-03-10 23:37:45 -03001878 se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnareba1ed42007-10-15 17:00:02 +02001879 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1880 }
1881#endif
Peter Zijlstra4a55b452007-09-05 14:32:49 +02001882 se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001883}
1884
Peter Zijlstra3f3a4902008-10-24 11:06:16 +02001885static int
1886wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1887
Rik van Rielac53db52011-02-01 09:51:03 -05001888/*
1889 * Pick the next process, keeping these things in mind, in this order:
1890 * 1) keep things fair between processes/task groups
1891 * 2) pick the "next" process, since someone really wants that to run
1892 * 3) pick the "last" process, for cache locality
1893 * 4) do not run the "skip" process, if something else is available
1894 */
Peter Zijlstraf4b67552008-11-04 21:25:07 +01001895static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01001896{
Rik van Rielac53db52011-02-01 09:51:03 -05001897 struct sched_entity *se = __pick_first_entity(cfs_rq);
Mike Galbraithf685cea2009-10-23 23:09:22 +02001898 struct sched_entity *left = se;
Peter Zijlstraf4b67552008-11-04 21:25:07 +01001899
Rik van Rielac53db52011-02-01 09:51:03 -05001900 /*
1901 * Avoid running the skip buddy, if running something else can
1902 * be done without getting too unfair.
1903 */
1904 if (cfs_rq->skip == se) {
1905 struct sched_entity *second = __pick_next_entity(se);
1906 if (second && wakeup_preempt_entity(second, left) < 1)
1907 se = second;
1908 }
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01001909
Mike Galbraithf685cea2009-10-23 23:09:22 +02001910 /*
1911 * Prefer last buddy, try to return the CPU to a preempted task.
1912 */
1913 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1914 se = cfs_rq->last;
1915
Rik van Rielac53db52011-02-01 09:51:03 -05001916 /*
1917 * Someone really wants this to run. If it's not unfair, run it.
1918 */
1919 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1920 se = cfs_rq->next;
1921
Mike Galbraithf685cea2009-10-23 23:09:22 +02001922 clear_buddies(cfs_rq, se);
Peter Zijlstra47932412008-11-04 21:25:09 +01001923
1924 return se;
Peter Zijlstraaa2ac252008-03-14 21:12:12 +01001925}
1926
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001927static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1928
Ingo Molnarab6cde22007-08-09 11:16:48 +02001929static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001930{
1931 /*
1932 * If still on the runqueue then deactivate_task()
1933 * was not called and update_curr() has to be done:
1934 */
1935 if (prev->on_rq)
Ingo Molnarb7cc0892007-08-09 11:16:47 +02001936 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001937
Paul Turnerd3d9dc32011-07-21 09:43:39 -07001938 /* throttle cfs_rqs exceeding runtime */
1939 check_cfs_rq_runtime(cfs_rq);
1940
Peter Zijlstraddc97292007-10-15 17:00:10 +02001941 check_spread(cfs_rq, prev);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001942 if (prev->on_rq) {
Ingo Molnar5870db52007-08-09 11:16:47 +02001943 update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001944 /* Put 'current' back into the tree. */
1945 __enqueue_entity(cfs_rq, prev);
Paul Turner9d85f212012-10-04 13:18:29 +02001946 /* in !on_rq case, update occurred at dequeue */
Paul Turner9ee474f2012-10-04 13:18:30 +02001947 update_entity_load_avg(prev, 1);
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001948 }
Ingo Molnar429d43b2007-10-15 17:00:03 +02001949 cfs_rq->curr = NULL;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001950}
1951
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001952static void
1953entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001954{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001955 /*
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001956 * Update run-time statistics of the 'current'.
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001957 */
Dmitry Adamushko30cfdcf2007-10-15 17:00:07 +02001958 update_curr(cfs_rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001959
Paul Turner43365bd2010-12-15 19:10:17 -08001960 /*
Paul Turner9d85f212012-10-04 13:18:29 +02001961 * Ensure that runnable average is periodically updated.
1962 */
Paul Turner9ee474f2012-10-04 13:18:30 +02001963 update_entity_load_avg(curr, 1);
Paul Turneraff3e492012-10-04 13:18:30 +02001964 update_cfs_rq_blocked_load(cfs_rq, 1);
Paul Turner9d85f212012-10-04 13:18:29 +02001965
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001966#ifdef CONFIG_SCHED_HRTICK
1967 /*
1968 * queued ticks are scheduled to match the slice, so don't bother
1969 * validating it and just reschedule.
1970 */
Harvey Harrison983ed7a2008-04-24 18:17:55 -07001971 if (queued) {
1972 resched_task(rq_of(cfs_rq)->curr);
1973 return;
1974 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001975 /*
1976 * don't let the period tick interfere with the hrtick preemption
1977 */
1978 if (!sched_feat(DOUBLE_TICK) &&
1979 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
1980 return;
1981#endif
1982
Yong Zhang2c2efae2011-07-29 16:20:33 +08001983 if (cfs_rq->nr_running > 1)
Ingo Molnar2e09bf52007-10-15 17:00:05 +02001984 check_preempt_tick(cfs_rq, curr);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02001985}
1986
Paul Turnerab84d312011-07-21 09:43:28 -07001987
1988/**************************************************
1989 * CFS bandwidth control machinery
1990 */
1991
1992#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra029632f2011-10-25 10:00:11 +02001993
1994#ifdef HAVE_JUMP_LABEL
Ingo Molnarc5905af2012-02-24 08:31:31 +01001995static struct static_key __cfs_bandwidth_used;
Peter Zijlstra029632f2011-10-25 10:00:11 +02001996
1997static inline bool cfs_bandwidth_used(void)
1998{
Ingo Molnarc5905af2012-02-24 08:31:31 +01001999 return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02002000}
2001
2002void account_cfs_bandwidth_used(int enabled, int was_enabled)
2003{
2004 /* only need to count groups transitioning between enabled/!enabled */
2005 if (enabled && !was_enabled)
Ingo Molnarc5905af2012-02-24 08:31:31 +01002006 static_key_slow_inc(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02002007 else if (!enabled && was_enabled)
Ingo Molnarc5905af2012-02-24 08:31:31 +01002008 static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra029632f2011-10-25 10:00:11 +02002009}
2010#else /* HAVE_JUMP_LABEL */
2011static bool cfs_bandwidth_used(void)
2012{
2013 return true;
2014}
2015
2016void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
2017#endif /* HAVE_JUMP_LABEL */
2018
Paul Turnerab84d312011-07-21 09:43:28 -07002019/*
2020 * default period for cfs group bandwidth.
2021 * default: 0.1s, units: nanoseconds
2022 */
2023static inline u64 default_cfs_period(void)
2024{
2025 return 100000000ULL;
2026}
Paul Turnerec12cb72011-07-21 09:43:30 -07002027
2028static inline u64 sched_cfs_bandwidth_slice(void)
2029{
2030 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2031}
2032
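/*
 * Putting the two values above together, and assuming the usual 5ms
 * default of sysctl_sched_cfs_bandwidth_slice: a group capped at, say,
 * 25ms of quota per 100ms period has its per-cpu cfs_rqs pull runtime
 * from the global pool in 5ms slices, so roughly five slices can be
 * handed out before the pool is exhausted and further cfs_rqs are
 * throttled until the next refill.
 */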
Paul Turnera9cf55b2011-07-21 09:43:32 -07002033/*
2034 * Replenish runtime according to assigned quota and update expiration time.
2035 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2036 * additional synchronization around rq->lock.
2037 *
2038 * requires cfs_b->lock
2039 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02002040void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turnera9cf55b2011-07-21 09:43:32 -07002041{
2042 u64 now;
2043
2044 if (cfs_b->quota == RUNTIME_INF)
2045 return;
2046
2047 now = sched_clock_cpu(smp_processor_id());
2048 cfs_b->runtime = cfs_b->quota;
2049 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2050}
2051
Peter Zijlstra029632f2011-10-25 10:00:11 +02002052static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2053{
2054 return &tg->cfs_bandwidth;
2055}
2056
Paul Turnerf1b17282012-10-04 13:18:31 +02002057/* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2058static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2059{
2060 if (unlikely(cfs_rq->throttle_count))
2061 return cfs_rq->throttled_clock_task;
2062
2063 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2064}
2065
Paul Turner85dac902011-07-21 09:43:33 -07002066/* returns 0 on failure to allocate runtime */
2067static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turnerec12cb72011-07-21 09:43:30 -07002068{
2069 struct task_group *tg = cfs_rq->tg;
2070 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turnera9cf55b2011-07-21 09:43:32 -07002071 u64 amount = 0, min_amount, expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07002072
2073 /* note: this is a positive sum as runtime_remaining <= 0 */
2074 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2075
2076 raw_spin_lock(&cfs_b->lock);
2077 if (cfs_b->quota == RUNTIME_INF)
2078 amount = min_amount;
Paul Turner58088ad2011-07-21 09:43:31 -07002079 else {
Paul Turnera9cf55b2011-07-21 09:43:32 -07002080 /*
2081 * If the bandwidth pool has become inactive, then at least one
2082 * period must have elapsed since the last consumption.
2083 * Refresh the global state and ensure bandwidth timer becomes
2084 * active.
2085 */
2086 if (!cfs_b->timer_active) {
2087 __refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner58088ad2011-07-21 09:43:31 -07002088 __start_cfs_bandwidth(cfs_b);
Paul Turnera9cf55b2011-07-21 09:43:32 -07002089 }
Paul Turner58088ad2011-07-21 09:43:31 -07002090
2091 if (cfs_b->runtime > 0) {
2092 amount = min(cfs_b->runtime, min_amount);
2093 cfs_b->runtime -= amount;
2094 cfs_b->idle = 0;
2095 }
Paul Turnerec12cb72011-07-21 09:43:30 -07002096 }
Paul Turnera9cf55b2011-07-21 09:43:32 -07002097 expires = cfs_b->runtime_expires;
Paul Turnerec12cb72011-07-21 09:43:30 -07002098 raw_spin_unlock(&cfs_b->lock);
2099
2100 cfs_rq->runtime_remaining += amount;
Paul Turnera9cf55b2011-07-21 09:43:32 -07002101 /*
2102 * we may have advanced our local expiration to account for allowed
2103 * spread between our sched_clock and the one on which runtime was
2104 * issued.
2105 */
2106 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2107 cfs_rq->runtime_expires = expires;
Paul Turner85dac902011-07-21 09:43:33 -07002108
2109 return cfs_rq->runtime_remaining > 0;
Paul Turnera9cf55b2011-07-21 09:43:32 -07002110}
2111
2112/*
2113 * Note: This depends on the synchronization provided by sched_clock and the
2114 * fact that rq->clock snapshots this value.
2115 */
2116static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2117{
2118 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2119 struct rq *rq = rq_of(cfs_rq);
2120
2121 /* if the deadline is ahead of our clock, nothing to do */
2122 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
2123 return;
2124
2125 if (cfs_rq->runtime_remaining < 0)
2126 return;
2127
2128 /*
2129 * If the local deadline has passed we have to consider the
2130 * possibility that our sched_clock is 'fast' and the global deadline
2131 * has not truly expired.
2132 *
 2133 * Fortunately we can determine whether this is the case by checking
2134 * whether the global deadline has advanced.
2135 */
2136
2137 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2138 /* extend local deadline, drift is bounded above by 2 ticks */
2139 cfs_rq->runtime_expires += TICK_NSEC;
2140 } else {
2141 /* global deadline is ahead, expiration has passed */
2142 cfs_rq->runtime_remaining = 0;
2143 }
Paul Turnerec12cb72011-07-21 09:43:30 -07002144}
2145
2146static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2147 unsigned long delta_exec)
2148{
Paul Turnera9cf55b2011-07-21 09:43:32 -07002149 /* dock delta_exec before expiring quota (as it could span periods) */
Paul Turnerec12cb72011-07-21 09:43:30 -07002150 cfs_rq->runtime_remaining -= delta_exec;
Paul Turnera9cf55b2011-07-21 09:43:32 -07002151 expire_cfs_rq_runtime(cfs_rq);
2152
2153 if (likely(cfs_rq->runtime_remaining > 0))
Paul Turnerec12cb72011-07-21 09:43:30 -07002154 return;
2155
Paul Turner85dac902011-07-21 09:43:33 -07002156 /*
2157 * if we're unable to extend our runtime we resched so that the active
2158 * hierarchy can be throttled
2159 */
2160 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2161 resched_task(rq_of(cfs_rq)->curr);
Paul Turnerec12cb72011-07-21 09:43:30 -07002162}
2163
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07002164static __always_inline
2165void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
Paul Turnerec12cb72011-07-21 09:43:30 -07002166{
Paul Turner56f570e2011-11-07 20:26:33 -08002167 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
Paul Turnerec12cb72011-07-21 09:43:30 -07002168 return;
2169
2170 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2171}
2172
Paul Turner85dac902011-07-21 09:43:33 -07002173static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2174{
Paul Turner56f570e2011-11-07 20:26:33 -08002175 return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner85dac902011-07-21 09:43:33 -07002176}
2177
Paul Turner64660c82011-07-21 09:43:36 -07002178/* check whether cfs_rq, or any parent, is throttled */
2179static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2180{
Paul Turner56f570e2011-11-07 20:26:33 -08002181 return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner64660c82011-07-21 09:43:36 -07002182}
2183
2184/*
2185 * Ensure that neither of the group entities corresponding to src_cpu or
2186 * dest_cpu are members of a throttled hierarchy when performing group
2187 * load-balance operations.
2188 */
2189static inline int throttled_lb_pair(struct task_group *tg,
2190 int src_cpu, int dest_cpu)
2191{
2192 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2193
2194 src_cfs_rq = tg->cfs_rq[src_cpu];
2195 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2196
2197 return throttled_hierarchy(src_cfs_rq) ||
2198 throttled_hierarchy(dest_cfs_rq);
2199}
2200
2201/* updated child weight may affect parent so we have to do this bottom up */
2202static int tg_unthrottle_up(struct task_group *tg, void *data)
2203{
2204 struct rq *rq = data;
2205 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2206
2207 cfs_rq->throttle_count--;
2208#ifdef CONFIG_SMP
2209 if (!cfs_rq->throttle_count) {
Paul Turnerf1b17282012-10-04 13:18:31 +02002210 /* adjust cfs_rq_clock_task() */
2211 cfs_rq->throttled_clock_task_time += rq->clock_task -
2212 cfs_rq->throttled_clock_task;
Paul Turner64660c82011-07-21 09:43:36 -07002213 }
2214#endif
2215
2216 return 0;
2217}
2218
2219static int tg_throttle_down(struct task_group *tg, void *data)
2220{
2221 struct rq *rq = data;
2222 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2223
Paul Turner82958362012-10-04 13:18:31 +02002224 /* group is entering throttled state, stop time */
2225 if (!cfs_rq->throttle_count)
Paul Turnerf1b17282012-10-04 13:18:31 +02002226 cfs_rq->throttled_clock_task = rq->clock_task;
Paul Turner64660c82011-07-21 09:43:36 -07002227 cfs_rq->throttle_count++;
2228
2229 return 0;
2230}
2231
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002232static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner85dac902011-07-21 09:43:33 -07002233{
2234 struct rq *rq = rq_of(cfs_rq);
2235 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2236 struct sched_entity *se;
2237 long task_delta, dequeue = 1;
2238
2239 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2240
Paul Turnerf1b17282012-10-04 13:18:31 +02002241 /* freeze hierarchy runnable averages while throttled */
Paul Turner64660c82011-07-21 09:43:36 -07002242 rcu_read_lock();
2243 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2244 rcu_read_unlock();
Paul Turner85dac902011-07-21 09:43:33 -07002245
2246 task_delta = cfs_rq->h_nr_running;
2247 for_each_sched_entity(se) {
2248 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2249 /* throttled entity or throttle-on-deactivate */
2250 if (!se->on_rq)
2251 break;
2252
2253 if (dequeue)
2254 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2255 qcfs_rq->h_nr_running -= task_delta;
2256
2257 if (qcfs_rq->load.weight)
2258 dequeue = 0;
2259 }
2260
2261 if (!se)
2262 rq->nr_running -= task_delta;
2263
2264 cfs_rq->throttled = 1;
Paul Turnerf1b17282012-10-04 13:18:31 +02002265 cfs_rq->throttled_clock = rq->clock;
Paul Turner85dac902011-07-21 09:43:33 -07002266 raw_spin_lock(&cfs_b->lock);
2267 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
2268 raw_spin_unlock(&cfs_b->lock);
2269}
2270
Peter Zijlstra029632f2011-10-25 10:00:11 +02002271void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner671fd9d2011-07-21 09:43:34 -07002272{
2273 struct rq *rq = rq_of(cfs_rq);
2274 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2275 struct sched_entity *se;
2276 int enqueue = 1;
2277 long task_delta;
2278
2279 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2280
2281 cfs_rq->throttled = 0;
2282 raw_spin_lock(&cfs_b->lock);
Paul Turnerf1b17282012-10-04 13:18:31 +02002283 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
Paul Turner671fd9d2011-07-21 09:43:34 -07002284 list_del_rcu(&cfs_rq->throttled_list);
2285 raw_spin_unlock(&cfs_b->lock);
2286
Paul Turner64660c82011-07-21 09:43:36 -07002287 update_rq_clock(rq);
2288 /* update hierarchical throttle state */
2289 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2290
Paul Turner671fd9d2011-07-21 09:43:34 -07002291 if (!cfs_rq->load.weight)
2292 return;
2293
2294 task_delta = cfs_rq->h_nr_running;
2295 for_each_sched_entity(se) {
2296 if (se->on_rq)
2297 enqueue = 0;
2298
2299 cfs_rq = cfs_rq_of(se);
2300 if (enqueue)
2301 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2302 cfs_rq->h_nr_running += task_delta;
2303
2304 if (cfs_rq_throttled(cfs_rq))
2305 break;
2306 }
2307
2308 if (!se)
2309 rq->nr_running += task_delta;
2310
2311 /* determine whether we need to wake up potentially idle cpu */
2312 if (rq->curr == rq->idle && rq->cfs.nr_running)
2313 resched_task(rq->curr);
2314}
2315
2316static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2317 u64 remaining, u64 expires)
2318{
2319 struct cfs_rq *cfs_rq;
2320 u64 runtime = remaining;
2321
2322 rcu_read_lock();
2323 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2324 throttled_list) {
2325 struct rq *rq = rq_of(cfs_rq);
2326
2327 raw_spin_lock(&rq->lock);
2328 if (!cfs_rq_throttled(cfs_rq))
2329 goto next;
2330
2331 runtime = -cfs_rq->runtime_remaining + 1;
2332 if (runtime > remaining)
2333 runtime = remaining;
2334 remaining -= runtime;
2335
2336 cfs_rq->runtime_remaining += runtime;
2337 cfs_rq->runtime_expires = expires;
2338
2339 /* we check whether we're throttled above */
2340 if (cfs_rq->runtime_remaining > 0)
2341 unthrottle_cfs_rq(cfs_rq);
2342
2343next:
2344 raw_spin_unlock(&rq->lock);
2345
2346 if (!remaining)
2347 break;
2348 }
2349 rcu_read_unlock();
2350
2351 return remaining;
2352}
2353
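/*
 * Distribution sketch: each throttled cfs_rq visited above is topped up
 * to just past zero (one nanosecond more than its deficit), which is all
 * unthrottle_cfs_rq() needs to requeue it; whatever is left of
 * 'remaining' after the walk is handed back to the caller for the global
 * pool, so one refill is never over-committed to the first entries in
 * the list.
 */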
Paul Turner58088ad2011-07-21 09:43:31 -07002354/*
2355 * Responsible for refilling a task_group's bandwidth and unthrottling its
2356 * cfs_rqs as appropriate. If there has been no activity within the last
2357 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2358 * used to track this state.
2359 */
2360static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2361{
Paul Turner671fd9d2011-07-21 09:43:34 -07002362 u64 runtime, runtime_expires;
2363 int idle = 1, throttled;
Paul Turner58088ad2011-07-21 09:43:31 -07002364
2365 raw_spin_lock(&cfs_b->lock);
2366 /* no need to continue the timer with no bandwidth constraint */
2367 if (cfs_b->quota == RUNTIME_INF)
2368 goto out_unlock;
2369
Paul Turner671fd9d2011-07-21 09:43:34 -07002370 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2371 /* idle depends on !throttled (for the case of a large deficit) */
2372 idle = cfs_b->idle && !throttled;
Nikhil Raoe8da1b12011-07-21 09:43:40 -07002373 cfs_b->nr_periods += overrun;
Paul Turner671fd9d2011-07-21 09:43:34 -07002374
Paul Turnera9cf55b2011-07-21 09:43:32 -07002375 /* if we're going inactive then everything else can be deferred */
2376 if (idle)
2377 goto out_unlock;
2378
2379 __refill_cfs_bandwidth_runtime(cfs_b);
2380
Paul Turner671fd9d2011-07-21 09:43:34 -07002381 if (!throttled) {
2382 /* mark as potentially idle for the upcoming period */
2383 cfs_b->idle = 1;
2384 goto out_unlock;
2385 }
Paul Turner58088ad2011-07-21 09:43:31 -07002386
Nikhil Raoe8da1b12011-07-21 09:43:40 -07002387 /* account preceding periods in which throttling occurred */
2388 cfs_b->nr_throttled += overrun;
2389
Paul Turner671fd9d2011-07-21 09:43:34 -07002390 /*
2391 * There are throttled entities so we must first use the new bandwidth
2392 * to unthrottle them before making it generally available. This
2393 * ensures that all existing debts will be paid before a new cfs_rq is
2394 * allowed to run.
2395 */
2396 runtime = cfs_b->runtime;
2397 runtime_expires = cfs_b->runtime_expires;
2398 cfs_b->runtime = 0;
2399
2400 /*
2401 * This check is repeated as we are holding onto the new bandwidth
2402 * while we unthrottle. This can potentially race with an unthrottled
2403 * group trying to acquire new bandwidth from the global pool.
2404 */
2405 while (throttled && runtime > 0) {
2406 raw_spin_unlock(&cfs_b->lock);
2407 /* we can't nest cfs_b->lock while distributing bandwidth */
2408 runtime = distribute_cfs_runtime(cfs_b, runtime,
2409 runtime_expires);
2410 raw_spin_lock(&cfs_b->lock);
2411
2412 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2413 }
2414
2415 /* return (any) remaining runtime */
2416 cfs_b->runtime = runtime;
2417 /*
2418 * While we are ensured activity in the period following an
2419 * unthrottle, this also covers the case in which the new bandwidth is
2420 * insufficient to cover the existing bandwidth deficit. (Forcing the
2421 * timer to remain active while there are any throttled entities.)
2422 */
2423 cfs_b->idle = 0;
Paul Turner58088ad2011-07-21 09:43:31 -07002424out_unlock:
2425 if (idle)
2426 cfs_b->timer_active = 0;
2427 raw_spin_unlock(&cfs_b->lock);
2428
2429 return idle;
2430}
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002431
Paul Turnerd8b49862011-07-21 09:43:41 -07002432/* a cfs_rq won't donate quota below this amount */
2433static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2434/* minimum remaining period time to redistribute slack quota */
2435static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2436/* how long we wait to gather additional slack before distributing */
2437static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2438
2439/* are we near the end of the current quota period? */
2440static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2441{
2442 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2443 u64 remaining;
2444
 2445 /* if the call-back is running, a quota refresh is already occurring */
2446 if (hrtimer_callback_running(refresh_timer))
2447 return 1;
2448
2449 /* is a quota refresh about to occur? */
2450 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2451 if (remaining < min_expire)
2452 return 1;
2453
2454 return 0;
2455}
2456
2457static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2458{
2459 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2460
2461 /* if there's a quota refresh soon don't bother with slack */
2462 if (runtime_refresh_within(cfs_b, min_left))
2463 return;
2464
2465 start_bandwidth_timer(&cfs_b->slack_timer,
2466 ns_to_ktime(cfs_bandwidth_slack_period));
2467}
2468
2469/* we know any runtime found here is valid as update_curr() precedes return */
2470static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2471{
2472 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2473 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2474
2475 if (slack_runtime <= 0)
2476 return;
2477
2478 raw_spin_lock(&cfs_b->lock);
2479 if (cfs_b->quota != RUNTIME_INF &&
2480 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2481 cfs_b->runtime += slack_runtime;
2482
2483 /* we are under rq->lock, defer unthrottling using a timer */
2484 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2485 !list_empty(&cfs_b->throttled_cfs_rq))
2486 start_cfs_slack_bandwidth(cfs_b);
2487 }
2488 raw_spin_unlock(&cfs_b->lock);
2489
2490 /* even if it's not valid for return we don't want to try again */
2491 cfs_rq->runtime_remaining -= slack_runtime;
2492}
2493
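/*
 * Slack return in numbers, using the constants above: a cfs_rq that
 * dequeues its last task while still holding, say, 4ms of unexpired
 * runtime keeps min_cfs_rq_runtime (1ms) for a quick re-wakeup and gives
 * 3ms back to the global pool; if enough slack collects there while other
 * cfs_rqs sit throttled, the 5ms slack timer redistributes it instead of
 * letting it idle until the next period refresh.
 */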
2494static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2495{
Paul Turner56f570e2011-11-07 20:26:33 -08002496 if (!cfs_bandwidth_used())
2497 return;
2498
Paul Turnerfccfdc62011-11-07 20:26:34 -08002499 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
Paul Turnerd8b49862011-07-21 09:43:41 -07002500 return;
2501
2502 __return_cfs_rq_runtime(cfs_rq);
2503}
2504
2505/*
2506 * This is done with a timer (instead of inline with bandwidth return) since
2507 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2508 */
2509static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2510{
2511 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2512 u64 expires;
2513
2514 /* confirm we're still not at a refresh boundary */
2515 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
2516 return;
2517
2518 raw_spin_lock(&cfs_b->lock);
2519 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2520 runtime = cfs_b->runtime;
2521 cfs_b->runtime = 0;
2522 }
2523 expires = cfs_b->runtime_expires;
2524 raw_spin_unlock(&cfs_b->lock);
2525
2526 if (!runtime)
2527 return;
2528
2529 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2530
2531 raw_spin_lock(&cfs_b->lock);
2532 if (expires == cfs_b->runtime_expires)
2533 cfs_b->runtime = runtime;
2534 raw_spin_unlock(&cfs_b->lock);
2535}
2536
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002537/*
2538 * When a group wakes up we want to make sure that its quota is not already
2539 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 2540 * runtime as update_curr() throttling can not trigger until it's on-rq.
2541 */
2542static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2543{
Paul Turner56f570e2011-11-07 20:26:33 -08002544 if (!cfs_bandwidth_used())
2545 return;
2546
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002547 /* an active group must be handled by the update_curr()->put() path */
2548 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2549 return;
2550
2551 /* ensure the group is not already throttled */
2552 if (cfs_rq_throttled(cfs_rq))
2553 return;
2554
2555 /* update runtime allocation */
2556 account_cfs_rq_runtime(cfs_rq, 0);
2557 if (cfs_rq->runtime_remaining <= 0)
2558 throttle_cfs_rq(cfs_rq);
2559}
2560
2561/* conditionally throttle active cfs_rq's from put_prev_entity() */
2562static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2563{
Paul Turner56f570e2011-11-07 20:26:33 -08002564 if (!cfs_bandwidth_used())
2565 return;
2566
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002567 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2568 return;
2569
2570 /*
2571 * it's possible for a throttled entity to be forced into a running
 2572 * state (e.g. set_curr_task); in this case we're finished.
2573 */
2574 if (cfs_rq_throttled(cfs_rq))
2575 return;
2576
2577 throttle_cfs_rq(cfs_rq);
2578}
Peter Zijlstra029632f2011-10-25 10:00:11 +02002579
2580static inline u64 default_cfs_period(void);
2581static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2582static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2583
2584static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2585{
2586 struct cfs_bandwidth *cfs_b =
2587 container_of(timer, struct cfs_bandwidth, slack_timer);
2588 do_sched_cfs_slack_timer(cfs_b);
2589
2590 return HRTIMER_NORESTART;
2591}
2592
2593static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2594{
2595 struct cfs_bandwidth *cfs_b =
2596 container_of(timer, struct cfs_bandwidth, period_timer);
2597 ktime_t now;
2598 int overrun;
2599 int idle = 0;
2600
2601 for (;;) {
2602 now = hrtimer_cb_get_time(timer);
2603 overrun = hrtimer_forward(timer, now, cfs_b->period);
2604
2605 if (!overrun)
2606 break;
2607
2608 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2609 }
2610
2611 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2612}
2613
2614void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2615{
2616 raw_spin_lock_init(&cfs_b->lock);
2617 cfs_b->runtime = 0;
2618 cfs_b->quota = RUNTIME_INF;
2619 cfs_b->period = ns_to_ktime(default_cfs_period());
2620
2621 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2622 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2623 cfs_b->period_timer.function = sched_cfs_period_timer;
2624 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2625 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2626}
2627
2628static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2629{
2630 cfs_rq->runtime_enabled = 0;
2631 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2632}
2633
2634/* requires cfs_b->lock, may release to reprogram timer */
2635void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2636{
2637 /*
2638 * The timer may be active because we're trying to set a new bandwidth
2639 * period or because we're racing with the tear-down path
2640 * (timer_active==0 becomes visible before the hrtimer call-back
 2641 * terminates). In either case we ensure that it's re-programmed.
2642 */
2643 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2644 raw_spin_unlock(&cfs_b->lock);
2645 /* ensure cfs_b->lock is available while we wait */
2646 hrtimer_cancel(&cfs_b->period_timer);
2647
2648 raw_spin_lock(&cfs_b->lock);
2649 /* if someone else restarted the timer then we're done */
2650 if (cfs_b->timer_active)
2651 return;
2652 }
2653
2654 cfs_b->timer_active = 1;
2655 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2656}
2657
2658static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2659{
2660 hrtimer_cancel(&cfs_b->period_timer);
2661 hrtimer_cancel(&cfs_b->slack_timer);
2662}
2663
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -07002664static void unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra029632f2011-10-25 10:00:11 +02002665{
2666 struct cfs_rq *cfs_rq;
2667
2668 for_each_leaf_cfs_rq(rq, cfs_rq) {
2669 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2670
2671 if (!cfs_rq->runtime_enabled)
2672 continue;
2673
2674 /*
2675 * clock_task is not advancing so we just need to make sure
2676 * there's some valid quota amount
2677 */
2678 cfs_rq->runtime_remaining = cfs_b->quota;
2679 if (cfs_rq_throttled(cfs_rq))
2680 unthrottle_cfs_rq(cfs_rq);
2681 }
2682}
2683
2684#else /* CONFIG_CFS_BANDWIDTH */
Paul Turnerf1b17282012-10-04 13:18:31 +02002685static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2686{
2687 return rq_of(cfs_rq)->clock_task;
2688}
2689
2690static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2691 unsigned long delta_exec) {}
Paul Turnerd3d9dc32011-07-21 09:43:39 -07002692static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2693static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra6c16a6d2012-03-21 13:07:16 -07002694static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner85dac902011-07-21 09:43:33 -07002695
2696static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2697{
2698 return 0;
2699}
Paul Turner64660c82011-07-21 09:43:36 -07002700
2701static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2702{
2703 return 0;
2704}
2705
2706static inline int throttled_lb_pair(struct task_group *tg,
2707 int src_cpu, int dest_cpu)
2708{
2709 return 0;
2710}
Peter Zijlstra029632f2011-10-25 10:00:11 +02002711
2712void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2713
2714#ifdef CONFIG_FAIR_GROUP_SCHED
2715static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turnerab84d312011-07-21 09:43:28 -07002716#endif
2717
Peter Zijlstra029632f2011-10-25 10:00:11 +02002718static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2719{
2720 return NULL;
2721}
2722static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -07002723static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra029632f2011-10-25 10:00:11 +02002724
2725#endif /* CONFIG_CFS_BANDWIDTH */
2726
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002727/**************************************************
2728 * CFS operations on tasks:
2729 */
2730
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002731#ifdef CONFIG_SCHED_HRTICK
2732static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2733{
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002734 struct sched_entity *se = &p->se;
2735 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2736
2737 WARN_ON(task_rq(p) != rq);
2738
Mike Galbraithb39e66e2011-11-22 15:20:07 +01002739 if (cfs_rq->nr_running > 1) {
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002740 u64 slice = sched_slice(cfs_rq, se);
2741 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2742 s64 delta = slice - ran;
2743
2744 if (delta < 0) {
2745 if (rq->curr == p)
2746 resched_task(p);
2747 return;
2748 }
2749
2750 /*
2751 * Don't schedule slices shorter than 10000ns, that just
2752 * doesn't make sense. Rely on vruntime for fairness.
2753 */
Peter Zijlstra31656512008-07-18 18:01:23 +02002754 if (rq->curr != p)
Peter Zijlstra157124c2008-07-28 11:53:11 +02002755 delta = max_t(s64, 10000LL, delta);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002756
Peter Zijlstra31656512008-07-18 18:01:23 +02002757 hrtick_start(rq, delta);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002758 }
2759}
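/*
 * Worked example (illustrative, not from the original source): if
 * sched_slice() works out to 6ms and the task has already run 4ms of it
 * (sum_exec_runtime - prev_sum_exec_runtime), delta = 2ms and the hrtimer
 * is programmed to fire 2ms from now, forcing a preemption check exactly
 * when the slice expires. If the slice is already overrun (delta < 0) and
 * the task is current, it is rescheduled immediately instead.
 */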
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02002760
2761/*
2762 * called from enqueue/dequeue and updates the hrtick when the
2763 * current task is from our class and nr_running is low enough
2764 * to matter.
2765 */
2766static void hrtick_update(struct rq *rq)
2767{
2768 struct task_struct *curr = rq->curr;
2769
Mike Galbraithb39e66e2011-11-22 15:20:07 +01002770 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02002771 return;
2772
2773 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2774 hrtick_start_fair(rq, curr);
2775}
Dhaval Giani55e12e52008-06-24 23:39:43 +05302776#else /* !CONFIG_SCHED_HRTICK */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002777static inline void
2778hrtick_start_fair(struct rq *rq, struct task_struct *p)
2779{
2780}
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02002781
2782static inline void hrtick_update(struct rq *rq)
2783{
2784}
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002785#endif
2786
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002787/*
2788 * The enqueue_task method is called before nr_running is
2789 * increased. Here we update the fair scheduling stats and
2790 * then put the task into the rbtree:
2791 */
Thomas Gleixnerea87bb72010-01-20 20:58:57 +00002792static void
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01002793enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002794{
2795 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01002796 struct sched_entity *se = &p->se;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002797
2798 for_each_sched_entity(se) {
Peter Zijlstra62fb1852008-02-25 17:34:02 +01002799 if (se->on_rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002800 break;
2801 cfs_rq = cfs_rq_of(se);
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01002802 enqueue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07002803
2804 /*
2805 * end evaluation on encountering a throttled cfs_rq
2806 *
2807 * note: in the case of encountering a throttled cfs_rq we will
2808 * post the final h_nr_running increment below.
2809 */
2810 if (cfs_rq_throttled(cfs_rq))
2811 break;
Paul Turner953bfcd2011-07-21 09:43:27 -07002812 cfs_rq->h_nr_running++;
Paul Turner85dac902011-07-21 09:43:33 -07002813
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01002814 flags = ENQUEUE_WAKEUP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002815 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002816
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002817 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08002818 cfs_rq = cfs_rq_of(se);
Paul Turner953bfcd2011-07-21 09:43:27 -07002819 cfs_rq->h_nr_running++;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002820
Paul Turner85dac902011-07-21 09:43:33 -07002821 if (cfs_rq_throttled(cfs_rq))
2822 break;
2823
Linus Torvalds17bc14b2012-12-14 07:20:43 -08002824 update_cfs_shares(cfs_rq);
Paul Turner9ee474f2012-10-04 13:18:30 +02002825 update_entity_load_avg(se, 1);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002826 }
2827
Ben Segall18bf2802012-10-04 12:51:20 +02002828 if (!se) {
2829 update_rq_runnable_avg(rq, rq->nr_running);
Paul Turner85dac902011-07-21 09:43:33 -07002830 inc_nr_running(rq);
Ben Segall18bf2802012-10-04 12:51:20 +02002831 }
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02002832 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002833}
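/*
 * Illustrative note on the split walk above: the first loop performs the
 * actual enqueue and stops early at an already-queued or throttled
 * ancestor; the second loop then posts the final h_nr_running increment
 * for the level where a throttle was hit (see the note above) and keeps
 * the shares/load averages fresh. rq->nr_running is only incremented when
 * the walk reached the root (se == NULL), since a throttled hierarchy
 * contributes no runnable tasks to the rq until it is unthrottled.
 */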
2834
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07002835static void set_next_buddy(struct sched_entity *se);
2836
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002837/*
2838 * The dequeue_task method is called before nr_running is
2839 * decreased. We remove the task from the rbtree and
2840 * update the fair scheduling stats:
2841 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01002842static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002843{
2844 struct cfs_rq *cfs_rq;
Peter Zijlstra62fb1852008-02-25 17:34:02 +01002845 struct sched_entity *se = &p->se;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07002846 int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002847
2848 for_each_sched_entity(se) {
2849 cfs_rq = cfs_rq_of(se);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01002850 dequeue_entity(cfs_rq, se, flags);
Paul Turner85dac902011-07-21 09:43:33 -07002851
2852 /*
2853 * end evaluation on encountering a throttled cfs_rq
2854 *
2855 * note: in the case of encountering a throttled cfs_rq we will
2856 * post the final h_nr_running decrement below.
2857 */
2858 if (cfs_rq_throttled(cfs_rq))
2859 break;
Paul Turner953bfcd2011-07-21 09:43:27 -07002860 cfs_rq->h_nr_running--;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002861
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002862 /* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07002863 if (cfs_rq->load.weight) {
2864 /*
2865 * Bias pick_next to pick a task from this cfs_rq, as
2866 * p is sleeping when it is within its sched_slice.
2867 */
2868 if (task_sleep && parent_entity(se))
2869 set_next_buddy(parent_entity(se));
Paul Turner9598c822011-07-06 22:30:37 -07002870
2871 /* avoid re-evaluating load for this entity */
2872 se = parent_entity(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002873 break;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07002874 }
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01002875 flags |= DEQUEUE_SLEEP;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002876 }
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01002877
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002878 for_each_sched_entity(se) {
Lin Ming0f317142011-07-22 09:14:31 +08002879 cfs_rq = cfs_rq_of(se);
Paul Turner953bfcd2011-07-21 09:43:27 -07002880 cfs_rq->h_nr_running--;
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002881
Paul Turner85dac902011-07-21 09:43:33 -07002882 if (cfs_rq_throttled(cfs_rq))
2883 break;
2884
Linus Torvalds17bc14b2012-12-14 07:20:43 -08002885 update_cfs_shares(cfs_rq);
Paul Turner9ee474f2012-10-04 13:18:30 +02002886 update_entity_load_avg(se, 1);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08002887 }
2888
Ben Segall18bf2802012-10-04 12:51:20 +02002889 if (!se) {
Paul Turner85dac902011-07-21 09:43:33 -07002890 dec_nr_running(rq);
Ben Segall18bf2802012-10-04 12:51:20 +02002891 update_rq_runnable_avg(rq, 1);
2892 }
Peter Zijlstraa4c2f002008-10-17 19:27:03 +02002893 hrtick_update(rq);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02002894}
2895
Gregory Haskinse7693a32008-01-25 21:08:09 +01002896#ifdef CONFIG_SMP
Peter Zijlstra029632f2011-10-25 10:00:11 +02002897/* Used instead of source_load when we know the type == 0 */
2898static unsigned long weighted_cpuload(const int cpu)
2899{
2900 return cpu_rq(cpu)->load.weight;
2901}
2902
2903/*
2904 * Return a low guess at the load of a migration-source cpu weighted
2905 * according to the scheduling class and "nice" value.
2906 *
2907 * We want to under-estimate the load of migration sources, to
2908 * balance conservatively.
2909 */
2910static unsigned long source_load(int cpu, int type)
2911{
2912 struct rq *rq = cpu_rq(cpu);
2913 unsigned long total = weighted_cpuload(cpu);
2914
2915 if (type == 0 || !sched_feat(LB_BIAS))
2916 return total;
2917
2918 return min(rq->cpu_load[type-1], total);
2919}
2920
2921/*
2922 * Return a high guess at the load of a migration-target cpu weighted
2923 * according to the scheduling class and "nice" value.
2924 */
2925static unsigned long target_load(int cpu, int type)
2926{
2927 struct rq *rq = cpu_rq(cpu);
2928 unsigned long total = weighted_cpuload(cpu);
2929
2930 if (type == 0 || !sched_feat(LB_BIAS))
2931 return total;
2932
2933 return max(rq->cpu_load[type-1], total);
2934}
2935
2936static unsigned long power_of(int cpu)
2937{
2938 return cpu_rq(cpu)->cpu_power;
2939}
2940
2941static unsigned long cpu_avg_load_per_task(int cpu)
2942{
2943 struct rq *rq = cpu_rq(cpu);
2944 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2945
2946 if (nr_running)
2947 return rq->load.weight / nr_running;
2948
2949 return 0;
2950}
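/*
 * Worked example (illustrative): suppose a cpu currently has
 * rq->load.weight == 1024 (one nice-0 task) while rq->cpu_load[idx-1]
 * still reads 2048 from a recent burst. Then source_load() returns
 * min(2048, 1024) = 1024 and target_load() returns max(2048, 1024) = 2048:
 * a deliberately low estimate when the cpu is considered as a migration
 * source and a high one when it is a candidate target, which is the
 * conservative bias described above.
 */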
2951
Ingo Molnar098fb9d2008-03-16 20:36:10 +01002952
Peter Zijlstra74f8e4b2011-04-05 17:23:47 +02002953static void task_waking_fair(struct task_struct *p)
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01002954{
2955 struct sched_entity *se = &p->se;
2956 struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra3fe16982011-04-05 17:23:48 +02002957 u64 min_vruntime;
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01002958
Peter Zijlstra3fe16982011-04-05 17:23:48 +02002959#ifndef CONFIG_64BIT
2960 u64 min_vruntime_copy;
Peter Zijlstra74f8e4b2011-04-05 17:23:47 +02002961
Peter Zijlstra3fe16982011-04-05 17:23:48 +02002962 do {
2963 min_vruntime_copy = cfs_rq->min_vruntime_copy;
2964 smp_rmb();
2965 min_vruntime = cfs_rq->min_vruntime;
2966 } while (min_vruntime != min_vruntime_copy);
2967#else
2968 min_vruntime = cfs_rq->min_vruntime;
2969#endif
2970
2971 se->vruntime -= min_vruntime;
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01002972}
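/*
 * Illustrative note: on 32-bit the loop above mimics a seqlock read side.
 * The update side stores min_vruntime, issues a write barrier and then
 * stores min_vruntime_copy, so re-reading until both fields agree
 * guarantees we never normalize the sleeper's vruntime against a torn
 * 64-bit value.
 */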
2973
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02002974#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstraf5bfb7d2008-06-27 13:41:39 +02002975/*
2976 * effective_load() calculates the load change as seen from the root_task_group
2977 *
2978 * Adding load to a group doesn't make a group heavier, but can cause movement
2979 * of group shares between cpus. Assuming the shares were perfectly aligned one
2980 * can calculate the shift in shares.
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02002981 *
2982 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2983 * on this @cpu and results in a total addition (subtraction) of @wg to the
2984 * total group weight.
2985 *
2986 * Given a runqueue weight distribution (rw_i) we can compute a shares
2987 * distribution (s_i) using:
2988 *
2989 * s_i = rw_i / \Sum rw_j (1)
2990 *
2991 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2992 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2993 * shares distribution (s_i):
2994 *
2995 * rw_i = { 2, 4, 1, 0 }
2996 * s_i = { 2/7, 4/7, 1/7, 0 }
2997 *
2998 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2999 * task used to run on and the CPU the waker is running on), we need to
3000 * compute the effect of waking a task on either CPU and, in case of a sync
3001 * wakeup, compute the effect of the current task going to sleep.
3002 *
3003 * So for a change of @wl to the local @cpu with an overall group weight change
 3004 * of @wg we can compute the new shares distribution (s'_i) using:
3005 *
3006 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
3007 *
3008 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3009 * differences in waking a task to CPU 0. The additional task changes the
3010 * weight and shares distributions like:
3011 *
3012 * rw'_i = { 3, 4, 1, 0 }
3013 * s'_i = { 3/8, 4/8, 1/8, 0 }
3014 *
3015 * We can then compute the difference in effective weight by using:
3016 *
3017 * dw_i = S * (s'_i - s_i) (3)
3018 *
3019 * Where 'S' is the group weight as seen by its parent.
3020 *
3021 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3022 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3023 * 4/7) times the weight of the group.
Peter Zijlstraf5bfb7d2008-06-27 13:41:39 +02003024 */
Peter Zijlstra2069dd72010-11-15 15:47:00 -08003025static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02003026{
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003027 struct sched_entity *se = tg->se[cpu];
Peter Zijlstraf1d239f2008-06-27 13:41:38 +02003028
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003029 if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstraf1d239f2008-06-27 13:41:38 +02003030 return wl;
3031
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003032 for_each_sched_entity(se) {
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003033 long w, W;
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02003034
Paul Turner977dda72011-01-14 17:57:50 -08003035 tg = se->my_q->tg;
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003036
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003037 /*
3038 * W = @wg + \Sum rw_j
3039 */
3040 W = wg + calc_tg_weight(tg, se->my_q);
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003041
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003042 /*
3043 * w = rw_i + @wl
3044 */
3045 w = se->my_q->load.weight + wl;
Peter Zijlstra940959e2008-09-23 15:33:42 +02003046
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003047 /*
3048 * wl = S * s'_i; see (2)
3049 */
3050 if (W > 0 && w < W)
3051 wl = (w * tg->shares) / W;
Paul Turner977dda72011-01-14 17:57:50 -08003052 else
3053 wl = tg->shares;
Peter Zijlstra940959e2008-09-23 15:33:42 +02003054
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003055 /*
3056 * Per the above, wl is the new se->load.weight value; since
3057 * those are clipped to [MIN_SHARES, ...) do so now. See
3058 * calc_cfs_shares().
3059 */
Paul Turner977dda72011-01-14 17:57:50 -08003060 if (wl < MIN_SHARES)
3061 wl = MIN_SHARES;
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003062
3063 /*
3064 * wl = dw_i = S * (s'_i - s_i); see (3)
3065 */
Paul Turner977dda72011-01-14 17:57:50 -08003066 wl -= se->load.weight;
Peter Zijlstracf5f0ac2011-10-13 16:52:28 +02003067
3068 /*
3069 * Recursively apply this logic to all parent groups to compute
3070 * the final effective load change on the root group. Since
3071 * only the @tg group gets extra weight, all parent groups can
3072 * only redistribute existing shares. @wl is the shift in shares
3073 * resulting from this level per the above.
3074 */
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003075 wg = 0;
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003076 }
3077
3078 return wl;
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02003079}
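/*
 * Continuing the worked example in the comment above (illustrative
 * numbers): for CPU 0 the change in shares is 3/8 - 2/7 = 5/56. Assuming
 * the parent-visible group weight is S = tg->shares = 1024,
 * effective_load() would report roughly 5/56 * 1024 ~= 91 for CPU 0 and
 * -4/56 * 1024 ~= -73 for CPU 1.
 */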
3080#else
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003081
Peter Zijlstra83378262008-06-27 13:41:37 +02003082static inline unsigned long effective_load(struct task_group *tg, int cpu,
3083 unsigned long wl, unsigned long wg)
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003084{
Peter Zijlstra83378262008-06-27 13:41:37 +02003085 return wl;
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02003086}
Peter Zijlstra4be9daa2008-06-27 13:41:30 +02003087
Peter Zijlstrabb3469a2008-06-27 13:41:27 +02003088#endif
3089
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003090static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
Ingo Molnar098fb9d2008-03-16 20:36:10 +01003091{
Paul Turnere37b6a72011-01-21 20:44:59 -08003092 s64 this_load, load;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003093 int idx, this_cpu, prev_cpu;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01003094 unsigned long tl_per_task;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003095 struct task_group *tg;
Peter Zijlstra83378262008-06-27 13:41:37 +02003096 unsigned long weight;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02003097 int balanced;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01003098
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003099 idx = sd->wake_idx;
3100 this_cpu = smp_processor_id();
3101 prev_cpu = task_cpu(p);
3102 load = source_load(prev_cpu, idx);
3103 this_load = target_load(this_cpu, idx);
Ingo Molnar098fb9d2008-03-16 20:36:10 +01003104
3105 /*
Ingo Molnar098fb9d2008-03-16 20:36:10 +01003106 * If sync wakeup then subtract the (maximum possible)
3107 * effect of the currently running task from the load
3108 * of the current CPU:
3109 */
Peter Zijlstra83378262008-06-27 13:41:37 +02003110 if (sync) {
3111 tg = task_group(current);
3112 weight = current->se.load.weight;
Ingo Molnar098fb9d2008-03-16 20:36:10 +01003113
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003114 this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra83378262008-06-27 13:41:37 +02003115 load += effective_load(tg, prev_cpu, 0, -weight);
3116 }
3117
3118 tg = task_group(p);
3119 weight = p->se.load.weight;
3120
Peter Zijlstra71a29aa2009-09-07 18:28:05 +02003121 /*
3122 * In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003123 * due to the sync cause above having dropped this_load to 0, we'll
3124 * always have an imbalance, but there's really nothing you can do
3125 * about that, so that's good too.
Peter Zijlstra71a29aa2009-09-07 18:28:05 +02003126 *
 3127 * Otherwise check whether the two cpus are near enough in load to allow
 3128 * this task to be woken on this_cpu.
3129 */
Paul Turnere37b6a72011-01-21 20:44:59 -08003130 if (this_load > 0) {
3131 s64 this_eff_load, prev_eff_load;
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02003132
3133 this_eff_load = 100;
3134 this_eff_load *= power_of(prev_cpu);
3135 this_eff_load *= this_load +
3136 effective_load(tg, this_cpu, weight, weight);
3137
3138 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3139 prev_eff_load *= power_of(this_cpu);
3140 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3141
3142 balanced = this_eff_load <= prev_eff_load;
3143 } else
3144 balanced = true;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02003145
3146 /*
3147 * If the currently running task will sleep within
3148 * a reasonable amount of time then attract this newly
3149 * woken task:
3150 */
Peter Zijlstra2fb76352008-10-08 09:16:04 +02003151 if (sync && balanced)
3152 return 1;
Mike Galbraithb3137bc2008-05-29 11:11:41 +02003153
Lucas De Marchi41acab82010-03-10 23:37:45 -03003154 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
Mike Galbraithb3137bc2008-05-29 11:11:41 +02003155 tl_per_task = cpu_avg_load_per_task(this_cpu);
3156
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003157 if (balanced ||
3158 (this_load <= load &&
3159 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
Ingo Molnar098fb9d2008-03-16 20:36:10 +01003160 /*
3161 * This domain has SD_WAKE_AFFINE and
3162 * p is cache cold in this domain, and
3163 * there is no bad imbalance.
3164 */
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003165 schedstat_inc(sd, ttwu_move_affine);
Lucas De Marchi41acab82010-03-10 23:37:45 -03003166 schedstat_inc(p, se.statistics.nr_wakeups_affine);
Ingo Molnar098fb9d2008-03-16 20:36:10 +01003167
3168 return 1;
3169 }
3170 return 0;
3171}
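/*
 * Worked example (illustrative; assumes the common imbalance_pct of 125
 * and equal cpu_power on both cpus): prev_eff_load's multiplier becomes
 * 100 + (125 - 100) / 2 = 112, so the wakeup is considered "balanced" as
 * long as this_cpu's biased load (including the woken task's effective
 * weight) does not exceed ~112% of prev_cpu's, i.e. the affine target
 * gets roughly a 12% benefit of the doubt.
 */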
3172
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003173/*
3174 * find_idlest_group finds and returns the least busy CPU group within the
3175 * domain.
3176 */
3177static struct sched_group *
Peter Zijlstra78e7ed52009-09-03 13:16:51 +02003178find_idlest_group(struct sched_domain *sd, struct task_struct *p,
Peter Zijlstra5158f4e2009-09-16 13:46:59 +02003179 int this_cpu, int load_idx)
Gregory Haskinse7693a32008-01-25 21:08:09 +01003180{
Andi Kleenb3bd3de2010-08-10 14:17:51 -07003181 struct sched_group *idlest = NULL, *group = sd->groups;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003182 unsigned long min_load = ULONG_MAX, this_load = 0;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003183 int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskinse7693a32008-01-25 21:08:09 +01003184
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003185 do {
3186 unsigned long load, avg_load;
3187 int local_group;
3188 int i;
Gregory Haskinse7693a32008-01-25 21:08:09 +01003189
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003190 /* Skip over this group if it has no CPUs allowed */
3191 if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstrafa17b502011-06-16 12:23:22 +02003192 tsk_cpus_allowed(p)))
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003193 continue;
3194
3195 local_group = cpumask_test_cpu(this_cpu,
3196 sched_group_cpus(group));
3197
3198 /* Tally up the load of all CPUs in the group */
3199 avg_load = 0;
3200
3201 for_each_cpu(i, sched_group_cpus(group)) {
3202 /* Bias balancing toward cpus of our domain */
3203 if (local_group)
3204 load = source_load(i, load_idx);
3205 else
3206 load = target_load(i, load_idx);
3207
3208 avg_load += load;
3209 }
3210
3211 /* Adjust by relative CPU power of the group */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02003212 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003213
3214 if (local_group) {
3215 this_load = avg_load;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003216 } else if (avg_load < min_load) {
3217 min_load = avg_load;
3218 idlest = group;
3219 }
3220 } while (group = group->next, group != sd->groups);
3221
3222 if (!idlest || 100*this_load < imbalance*min_load)
3223 return NULL;
3224 return idlest;
3225}
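/*
 * Worked example (illustrative; imbalance_pct of 125 assumed): imbalance
 * is 100 + (125 - 100) / 2 = 112, so a remote group is only returned when
 * 100 * this_load >= 112 * min_load. In other words the idlest group must
 * be at least ~11% less loaded than the local group before the task is
 * placed away from this_cpu.
 */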
3226
3227/*
3228 * find_idlest_cpu - find the idlest cpu among the cpus in group.
3229 */
3230static int
3231find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3232{
3233 unsigned long load, min_load = ULONG_MAX;
3234 int idlest = -1;
3235 int i;
3236
3237 /* Traverse only the allowed CPUs */
Peter Zijlstrafa17b502011-06-16 12:23:22 +02003238 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003239 load = weighted_cpuload(i);
3240
3241 if (load < min_load || (load == min_load && i == this_cpu)) {
3242 min_load = load;
3243 idlest = i;
Gregory Haskinse7693a32008-01-25 21:08:09 +01003244 }
3245 }
3246
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003247 return idlest;
3248}
Gregory Haskinse7693a32008-01-25 21:08:09 +01003249
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003250/*
Peter Zijlstraa50bde52009-11-12 15:55:28 +01003251 * Try to locate an idle CPU in the sched_domain.
3252 */
Suresh Siddha99bd5e22010-03-31 16:47:45 -07003253static int select_idle_sibling(struct task_struct *p, int target)
Peter Zijlstraa50bde52009-11-12 15:55:28 +01003254{
3255 int cpu = smp_processor_id();
3256 int prev_cpu = task_cpu(p);
Suresh Siddha99bd5e22010-03-31 16:47:45 -07003257 struct sched_domain *sd;
Linus Torvalds37407ea2012-09-16 12:29:43 -07003258 struct sched_group *sg;
3259 int i;
Peter Zijlstraa50bde52009-11-12 15:55:28 +01003260
3261 /*
Suresh Siddha99bd5e22010-03-31 16:47:45 -07003262 * If the task is going to be woken-up on this cpu and if it is
3263 * already idle, then it is the right target.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01003264 */
Suresh Siddha99bd5e22010-03-31 16:47:45 -07003265 if (target == cpu && idle_cpu(cpu))
3266 return cpu;
3267
3268 /*
3269 * If the task is going to be woken-up on the cpu where it previously
3270 * ran and if it is currently idle, then it the right target.
3271 */
3272 if (target == prev_cpu && idle_cpu(prev_cpu))
Peter Zijlstrafe3bcfe2009-11-12 15:55:29 +01003273 return prev_cpu;
Peter Zijlstraa50bde52009-11-12 15:55:28 +01003274
3275 /*
Linus Torvalds37407ea2012-09-16 12:29:43 -07003276 * Otherwise, iterate the domains and find an eligible idle cpu.
Peter Zijlstraa50bde52009-11-12 15:55:28 +01003277 */
Peter Zijlstra518cd622011-12-07 15:07:31 +01003278 sd = rcu_dereference(per_cpu(sd_llc, target));
Suresh Siddha77e81362011-11-17 11:08:23 -08003279 for_each_lower_domain(sd) {
Linus Torvalds37407ea2012-09-16 12:29:43 -07003280 sg = sd->groups;
3281 do {
3282 if (!cpumask_intersects(sched_group_cpus(sg),
3283 tsk_cpus_allowed(p)))
3284 goto next;
Mike Galbraith970e1782012-06-12 05:18:32 +02003285
Linus Torvalds37407ea2012-09-16 12:29:43 -07003286 for_each_cpu(i, sched_group_cpus(sg)) {
3287 if (!idle_cpu(i))
3288 goto next;
3289 }
3290
3291 target = cpumask_first_and(sched_group_cpus(sg),
3292 tsk_cpus_allowed(p));
3293 goto done;
3294next:
3295 sg = sg->next;
3296 } while (sg != sd->groups);
3297 }
3298done:
Peter Zijlstraa50bde52009-11-12 15:55:28 +01003299 return target;
3300}
3301
3302/*
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003303 * select_task_rq_fair: balance the current task (running on cpu) in domains
 3304 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
 3305 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
3306 *
3307 * Balance, ie. select the least loaded group.
3308 *
3309 * Returns the target CPU number, or the same CPU if no balancing is needed.
3310 *
3311 * preempt must be disabled.
3312 */
Peter Zijlstra0017d732010-03-24 18:34:10 +01003313static int
Peter Zijlstra7608dec2011-04-05 17:23:46 +02003314select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003315{
Peter Zijlstra29cd8ba2009-09-17 09:01:14 +02003316 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003317 int cpu = smp_processor_id();
3318 int prev_cpu = task_cpu(p);
3319 int new_cpu = cpu;
Suresh Siddha99bd5e22010-03-31 16:47:45 -07003320 int want_affine = 0;
Peter Zijlstra5158f4e2009-09-16 13:46:59 +02003321 int sync = wake_flags & WF_SYNC;
Gregory Haskinse7693a32008-01-25 21:08:09 +01003322
Peter Zijlstra29baa742012-04-23 12:11:21 +02003323 if (p->nr_cpus_allowed == 1)
Mike Galbraith76854c72011-11-22 15:18:24 +01003324 return prev_cpu;
3325
Peter Zijlstra0763a662009-09-14 19:37:39 +02003326 if (sd_flag & SD_BALANCE_WAKE) {
Peter Zijlstrafa17b502011-06-16 12:23:22 +02003327 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003328 want_affine = 1;
3329 new_cpu = prev_cpu;
3330 }
Gregory Haskinse7693a32008-01-25 21:08:09 +01003331
Peter Zijlstradce840a2011-04-07 14:09:50 +02003332 rcu_read_lock();
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003333 for_each_domain(cpu, tmp) {
Peter Zijlstrae4f428882009-12-16 18:04:34 +01003334 if (!(tmp->flags & SD_LOAD_BALANCE))
3335 continue;
3336
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003337 /*
Suresh Siddha99bd5e22010-03-31 16:47:45 -07003338 * If both cpu and prev_cpu are part of this domain,
3339 * cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstrafe3bcfe2009-11-12 15:55:29 +01003340 */
Suresh Siddha99bd5e22010-03-31 16:47:45 -07003341 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3342 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3343 affine_sd = tmp;
Alex Shif03542a2012-07-26 08:55:34 +08003344 break;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003345 }
3346
Alex Shif03542a2012-07-26 08:55:34 +08003347 if (tmp->flags & sd_flag)
Peter Zijlstra29cd8ba2009-09-17 09:01:14 +02003348 sd = tmp;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003349 }
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003350
Mike Galbraith8b911ac2010-03-11 17:17:16 +01003351 if (affine_sd) {
Alex Shif03542a2012-07-26 08:55:34 +08003352 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
Peter Zijlstradce840a2011-04-07 14:09:50 +02003353 prev_cpu = cpu;
3354
3355 new_cpu = select_idle_sibling(p, prev_cpu);
3356 goto unlock;
Mike Galbraith8b911ac2010-03-11 17:17:16 +01003357 }
Peter Zijlstra3b640892009-09-16 13:44:33 +02003358
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003359 while (sd) {
Peter Zijlstra5158f4e2009-09-16 13:46:59 +02003360 int load_idx = sd->forkexec_idx;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003361 struct sched_group *group;
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003362 int weight;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003363
Peter Zijlstra0763a662009-09-14 19:37:39 +02003364 if (!(sd->flags & sd_flag)) {
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003365 sd = sd->child;
3366 continue;
3367 }
3368
Peter Zijlstra5158f4e2009-09-16 13:46:59 +02003369 if (sd_flag & SD_BALANCE_WAKE)
3370 load_idx = sd->wake_idx;
3371
3372 group = find_idlest_group(sd, p, cpu, load_idx);
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003373 if (!group) {
3374 sd = sd->child;
3375 continue;
3376 }
3377
Peter Zijlstrad7c33c42009-09-11 12:45:38 +02003378 new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003379 if (new_cpu == -1 || new_cpu == cpu) {
3380 /* Now try balancing at a lower domain level of cpu */
3381 sd = sd->child;
3382 continue;
3383 }
3384
3385 /* Now try balancing at a lower domain level of new_cpu */
3386 cpu = new_cpu;
Peter Zijlstra669c55e2010-04-16 14:59:29 +02003387 weight = sd->span_weight;
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003388 sd = NULL;
3389 for_each_domain(cpu, tmp) {
Peter Zijlstra669c55e2010-04-16 14:59:29 +02003390 if (weight <= tmp->span_weight)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003391 break;
Peter Zijlstra0763a662009-09-14 19:37:39 +02003392 if (tmp->flags & sd_flag)
Peter Zijlstraaaee1202009-09-10 13:36:25 +02003393 sd = tmp;
3394 }
3395 /* while loop will break here if sd == NULL */
Gregory Haskinse7693a32008-01-25 21:08:09 +01003396 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02003397unlock:
3398 rcu_read_unlock();
Gregory Haskinse7693a32008-01-25 21:08:09 +01003399
Peter Zijlstrac88d5912009-09-10 13:50:02 +02003400 return new_cpu;
Gregory Haskinse7693a32008-01-25 21:08:09 +01003401}
Paul Turner0a74bef2012-10-04 13:18:30 +02003402
3403/*
Paul Turnerf4e26b12012-10-04 13:18:32 +02003404 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below
 3405 * may be removed once load tracking is useful for purposes beyond shares
 3406 * distribution (e.g. load-balance).
3407 */
3408#ifdef CONFIG_FAIR_GROUP_SCHED
3409/*
Paul Turner0a74bef2012-10-04 13:18:30 +02003410 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3411 * cfs_rq_of(p) references at time of call are still valid and identify the
3412 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3413 * other assumptions, including the state of rq->lock, should be made.
3414 */
3415static void
3416migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3417{
Paul Turneraff3e492012-10-04 13:18:30 +02003418 struct sched_entity *se = &p->se;
3419 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3420
3421 /*
3422 * Load tracking: accumulate removed load so that it can be processed
3423 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3424 * to blocked load iff they have a positive decay-count. It can never
3425 * be negative here since on-rq tasks have decay-count == 0.
3426 */
3427 if (se->avg.decay_count) {
3428 se->avg.decay_count = -__synchronize_entity_decay(se);
3429 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3430 }
Paul Turner0a74bef2012-10-04 13:18:30 +02003431}
Paul Turnerf4e26b12012-10-04 13:18:32 +02003432#endif
Gregory Haskinse7693a32008-01-25 21:08:09 +01003433#endif /* CONFIG_SMP */
3434
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01003435static unsigned long
3436wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02003437{
3438 unsigned long gran = sysctl_sched_wakeup_granularity;
3439
3440 /*
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01003441 * Since curr is running now, convert the gran from real-time
 3442 * to virtual-time in its units.
Mike Galbraith13814d42010-03-11 17:17:04 +01003443 *
3444 * By using 'se' instead of 'curr' we penalize light tasks, so
3445 * they get preempted easier. That is, if 'se' < 'curr' then
3446 * the resulting gran will be larger, therefore penalizing the
3447 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3448 * be smaller, again penalizing the lighter task.
3449 *
3450 * This is especially important for buddies when the leftmost
3451 * task is higher priority than the buddy.
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02003452 */
Shaohua Lif4ad9bd2011-04-08 12:53:09 +08003453 return calc_delta_fair(gran, se);
Peter Zijlstra0bbd3332008-04-19 19:44:57 +02003454}
3455
3456/*
Peter Zijlstra464b7522008-10-24 11:06:15 +02003457 * Should 'se' preempt 'curr'.
3458 *
3459 * |s1
3460 * |s2
3461 * |s3
3462 * g
3463 * |<--->|c
3464 *
3465 * w(c, s1) = -1
3466 * w(c, s2) = 0
3467 * w(c, s3) = 1
3468 *
3469 */
3470static int
3471wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3472{
3473 s64 gran, vdiff = curr->vruntime - se->vruntime;
3474
3475 if (vdiff <= 0)
3476 return -1;
3477
Peter Zijlstrae52fb7c2009-01-14 12:39:19 +01003478 gran = wakeup_gran(curr, se);
Peter Zijlstra464b7522008-10-24 11:06:15 +02003479 if (vdiff > gran)
3480 return 1;
3481
3482 return 0;
3483}
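/*
 * Worked example (illustrative; assumes a 1ms wakeup granularity and a
 * nice-0 'se' so calc_delta_fair() leaves gran at 1ms of virtual time):
 * if the waking se's vruntime trails curr's by 2ms, vdiff = 2ms > gran
 * and we return 1 (preempt, case s3 above); trailing by only 0.5ms gives
 * 0 (case s2); and an se whose vruntime is already ahead of curr's gives
 * -1 (case s1).
 */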
3484
Peter Zijlstra02479092008-11-04 21:25:10 +01003485static void set_last_buddy(struct sched_entity *se)
3486{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07003487 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3488 return;
3489
3490 for_each_sched_entity(se)
3491 cfs_rq_of(se)->last = se;
Peter Zijlstra02479092008-11-04 21:25:10 +01003492}
3493
3494static void set_next_buddy(struct sched_entity *se)
3495{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07003496 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3497 return;
3498
3499 for_each_sched_entity(se)
3500 cfs_rq_of(se)->next = se;
Peter Zijlstra02479092008-11-04 21:25:10 +01003501}
3502
Rik van Rielac53db52011-02-01 09:51:03 -05003503static void set_skip_buddy(struct sched_entity *se)
3504{
Venkatesh Pallipadi69c80f32011-04-13 18:21:09 -07003505 for_each_sched_entity(se)
3506 cfs_rq_of(se)->skip = se;
Rik van Rielac53db52011-02-01 09:51:03 -05003507}
3508
Peter Zijlstra464b7522008-10-24 11:06:15 +02003509/*
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003510 * Preempt the current task with a newly woken task if needed:
3511 */
Peter Zijlstra5a9b86f2009-09-16 13:47:58 +02003512static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003513{
3514 struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri8651a862007-10-15 17:00:12 +02003515 struct sched_entity *se = &curr->se, *pse = &p->se;
Mike Galbraith03e89e42008-12-16 08:45:30 +01003516 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraithf685cea2009-10-23 23:09:22 +02003517 int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07003518 int next_buddy_marked = 0;
Mike Galbraith03e89e42008-12-16 08:45:30 +01003519
Ingo Molnar4ae7d5c2008-03-19 01:42:00 +01003520 if (unlikely(se == pse))
3521 return;
3522
Paul Turner5238cdd2011-07-21 09:43:37 -07003523 /*
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003524 * This is possible from callers such as move_task(), in which we
Paul Turner5238cdd2011-07-21 09:43:37 -07003525 * unconditionally check_preempt_curr() after an enqueue (which may have
 3526 * led to a throttle). This both saves work and prevents false
3527 * next-buddy nomination below.
3528 */
3529 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3530 return;
3531
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07003532 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith3cb63d52009-09-11 12:01:17 +02003533 set_next_buddy(pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07003534 next_buddy_marked = 1;
3535 }
Peter Zijlstra57fdc262008-09-23 15:33:45 +02003536
Bharata B Raoaec0a512008-08-28 14:42:49 +05303537 /*
3538 * We can come here with TIF_NEED_RESCHED already set from new task
3539 * wake up path.
Paul Turner5238cdd2011-07-21 09:43:37 -07003540 *
3541 * Note: this also catches the edge-case of curr being in a throttled
3542 * group (e.g. via set_curr_task), since update_curr() (in the
3543 * enqueue of curr) will have resulted in resched being set. This
3544 * prevents us from potentially nominating it as a false LAST_BUDDY
3545 * below.
Bharata B Raoaec0a512008-08-28 14:42:49 +05303546 */
3547 if (test_tsk_need_resched(curr))
3548 return;
3549
Darren Harta2f5c9a2011-02-22 13:04:33 -08003550 /* Idle tasks are by definition preempted by non-idle tasks. */
3551 if (unlikely(curr->policy == SCHED_IDLE) &&
3552 likely(p->policy != SCHED_IDLE))
3553 goto preempt;
3554
Ingo Molnar91c234b2007-10-15 17:00:18 +02003555 /*
Darren Harta2f5c9a2011-02-22 13:04:33 -08003556 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3557 * is driven by the tick):
Ingo Molnar91c234b2007-10-15 17:00:18 +02003558 */
Ingo Molnar8ed92e512012-10-14 14:28:50 +02003559 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar91c234b2007-10-15 17:00:18 +02003560 return;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003561
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01003562 find_matching_se(&se, &pse);
Paul Turner9bbd7372011-07-05 19:07:21 -07003563 update_curr(cfs_rq_of(se));
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01003564 BUG_ON(!pse);
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07003565 if (wakeup_preempt_entity(se, pse) == 1) {
3566 /*
3567 * Bias pick_next to pick the sched entity that is
3568 * triggering this preemption.
3569 */
3570 if (!next_buddy_marked)
3571 set_next_buddy(pse);
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01003572 goto preempt;
Venkatesh Pallipadi2f368252011-04-14 10:30:53 -07003573 }
Jupyung Leea65ac742009-11-17 18:51:40 +09003574
Peter Zijlstra3a7e73a2009-11-28 18:51:02 +01003575 return;
3576
3577preempt:
3578 resched_task(curr);
3579 /*
3580 * Only set the backward buddy when the current task is still
3581 * on the rq. This can happen when a wakeup gets interleaved
3582 * with schedule on the ->pre_schedule() or idle_balance()
 3583 * point, either of which can drop the rq lock.
3584 *
3585 * Also, during early boot the idle thread is in the fair class,
 3586 * for obvious reasons it's a bad idea to schedule back to it.
3587 */
3588 if (unlikely(!se->on_rq || curr == rq->idle))
3589 return;
3590
3591 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3592 set_last_buddy(se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003593}
3594
Ingo Molnarfb8d4722007-08-09 11:16:48 +02003595static struct task_struct *pick_next_task_fair(struct rq *rq)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003596{
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01003597 struct task_struct *p;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003598 struct cfs_rq *cfs_rq = &rq->cfs;
3599 struct sched_entity *se;
3600
Tim Blechmann36ace272009-11-24 11:55:45 +01003601 if (!cfs_rq->nr_running)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003602 return NULL;
3603
3604 do {
Ingo Molnar9948f4b2007-08-09 11:16:48 +02003605 se = pick_next_entity(cfs_rq);
Peter Zijlstraf4b67552008-11-04 21:25:07 +01003606 set_next_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003607 cfs_rq = group_cfs_rq(se);
3608 } while (cfs_rq);
3609
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01003610 p = task_of(se);
Mike Galbraithb39e66e2011-11-22 15:20:07 +01003611 if (hrtick_enabled(rq))
3612 hrtick_start_fair(rq, p);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01003613
3614 return p;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003615}
3616
3617/*
3618 * Account for a descheduled task:
3619 */
Ingo Molnar31ee5292007-08-09 11:16:49 +02003620static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003621{
3622 struct sched_entity *se = &prev->se;
3623 struct cfs_rq *cfs_rq;
3624
3625 for_each_sched_entity(se) {
3626 cfs_rq = cfs_rq_of(se);
Ingo Molnarab6cde22007-08-09 11:16:48 +02003627 put_prev_entity(cfs_rq, se);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003628 }
3629}
3630
Rik van Rielac53db52011-02-01 09:51:03 -05003631/*
3632 * sched_yield() is very simple
3633 *
3634 * The magic of dealing with the ->skip buddy is in pick_next_entity.
3635 */
3636static void yield_task_fair(struct rq *rq)
3637{
3638 struct task_struct *curr = rq->curr;
3639 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3640 struct sched_entity *se = &curr->se;
3641
3642 /*
3643 * Are we the only task in the tree?
3644 */
3645 if (unlikely(rq->nr_running == 1))
3646 return;
3647
3648 clear_buddies(cfs_rq, se);
3649
3650 if (curr->policy != SCHED_BATCH) {
3651 update_rq_clock(rq);
3652 /*
3653 * Update run-time statistics of the 'current'.
3654 */
3655 update_curr(cfs_rq);
Mike Galbraith916671c2011-11-22 15:21:26 +01003656 /*
3657 * Tell update_rq_clock() that we've just updated,
3658 * so we don't do microscopic update in schedule()
3659 * and double the fastpath cost.
3660 */
3661 rq->skip_clock_update = 1;
Rik van Rielac53db52011-02-01 09:51:03 -05003662 }
3663
3664 set_skip_buddy(se);
3665}
3666
Mike Galbraithd95f4122011-02-01 09:50:51 -05003667static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3668{
3669 struct sched_entity *se = &p->se;
3670
Paul Turner5238cdd2011-07-21 09:43:37 -07003671 /* throttled hierarchies are not runnable */
3672 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraithd95f4122011-02-01 09:50:51 -05003673 return false;
3674
3675 /* Tell the scheduler that we'd really like pse to run next. */
3676 set_next_buddy(se);
3677
Mike Galbraithd95f4122011-02-01 09:50:51 -05003678 yield_task_fair(rq);
3679
3680 return true;
3681}
3682
Peter Williams681f3e62007-10-24 18:23:51 +02003683#ifdef CONFIG_SMP
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003684/**************************************************
Peter Zijlstrae9c84cb2012-07-03 13:53:26 +02003685 * Fair scheduling class load-balancing methods.
3686 *
3687 * BASICS
3688 *
3689 * The purpose of load-balancing is to achieve the same basic fairness the
3690 * per-cpu scheduler provides, namely provide a proportional amount of compute
3691 * time to each task. This is expressed in the following equation:
3692 *
3693 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3694 *
3695 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3696 * W_i,0 is defined as:
3697 *
3698 * W_i,0 = \Sum_j w_i,j (2)
3699 *
3700 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3701 * is derived from the nice value as per prio_to_weight[].
3702 *
3703 * The weight average is an exponential decay average of the instantaneous
3704 * weight:
3705 *
3706 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3707 *
3708 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3709 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3710 * can also include other factors [XXX].
3711 *
3712 * To achieve this balance we define a measure of imbalance which follows
3713 * directly from (1):
3714 *
3715 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3716 *
 3717 * We then move tasks around to minimize the imbalance. In the continuous
 3718 * function space it is obvious this converges; in the discrete case we get
 3719 * a few fun cases generally called infeasible weight scenarios.
3720 *
3721 * [XXX expand on:
3722 * - infeasible weights;
3723 * - local vs global optima in the discrete case. ]
3724 *
3725 *
3726 * SCHED DOMAINS
3727 *
3728 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3729 * for all i,j solution, we create a tree of cpus that follows the hardware
3730 * topology where each level pairs two lower groups (or better). This results
3731 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3732 * tree to only the first of the previous level and we decrease the frequency
3733 * of load-balance at each level inv. proportional to the number of cpus in
3734 * the groups.
3735 *
3736 * This yields:
3737 *
3738 * log_2 n 1 n
3739 * \Sum { --- * --- * 2^i } = O(n) (5)
3740 * i = 0 2^i 2^i
3741 * `- size of each group
3742 * | | `- number of cpus doing load-balance
3743 * | `- freq
3744 * `- sum over all levels
3745 *
3746 * Coupled with a limit on how many tasks we can migrate every balance pass,
3747 * this makes (5) the runtime complexity of the balancer.
3748 *
3749 * An important property here is that each CPU is still (indirectly) connected
3750 * to every other cpu in at most O(log n) steps:
3751 *
3752 * The adjacency matrix of the resulting graph is given by:
3753 *
3754 * log_2 n
3755 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3756 * k = 0
3757 *
3758 * And you'll find that:
3759 *
3760 * A^(log_2 n)_i,j != 0 for all i,j (7)
3761 *
3762 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3763 * The task movement gives a factor of O(m), giving a convergence complexity
3764 * of:
3765 *
3766 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3767 *
3768 *
3769 * WORK CONSERVING
3770 *
3771 * In order to avoid CPUs going idle while there's still work to do, new idle
3772 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3773 * tree itself instead of relying on other CPUs to bring it work.
3774 *
3775 * This adds some complexity to both (5) and (8) but it reduces the total idle
3776 * time.
3777 *
3778 * [XXX more?]
3779 *
3780 *
3781 * CGROUPS
3782 *
3783 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3784 *
3785 * s_k,i
3786 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
3787 * S_k
3788 *
3789 * Where
3790 *
3791 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3792 *
3793 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3794 *
 3795 * The big problem is S_k: it's a global sum needed to compute a local (W_i)
3796 * property.
3797 *
3798 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3799 * rewrite all of this once again.]
3800 */
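/*
 * Worked instance of (5), illustrative: for n = 8 cpus the per-level
 * terms n/2^i sum to 8 + 4 + 2 + 1 = 15 < 2n, so the balancing work per
 * round stays linear in the number of cpus, while (7) still guarantees
 * every cpu is reachable from every other in at most log_2(8) = 3 steps.
 */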
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02003801
Hiroshi Shimamotoed387b72012-01-31 11:40:32 +09003802static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3803
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003804#define LBF_ALL_PINNED 0x01
Peter Zijlstra367456c2012-02-20 21:49:09 +01003805#define LBF_NEED_BREAK 0x02
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05303806#define LBF_SOME_PINNED 0x04
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003807
3808struct lb_env {
3809 struct sched_domain *sd;
3810
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003811 struct rq *src_rq;
Prashanth Nageshappa85c1e7d2012-06-19 17:47:34 +05303812 int src_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003813
3814 int dst_cpu;
3815 struct rq *dst_rq;
3816
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05303817 struct cpumask *dst_grpmask;
3818 int new_dst_cpu;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003819 enum cpu_idle_type idle;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02003820 long imbalance;
Michael Wangb94031302012-07-12 16:10:13 +08003821 /* The set of CPUs under consideration for load-balancing */
3822 struct cpumask *cpus;
3823
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003824 unsigned int flags;
Peter Zijlstra367456c2012-02-20 21:49:09 +01003825
3826 unsigned int loop;
3827 unsigned int loop_break;
3828 unsigned int loop_max;
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003829};
3830
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003831/*
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003832 * move_task - move a task from one runqueue to another runqueue.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003833 * Both runqueues must be locked.
3834 */
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003835static void move_task(struct task_struct *p, struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003836{
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003837 deactivate_task(env->src_rq, p, 0);
3838 set_task_cpu(p, env->dst_cpu);
3839 activate_task(env->dst_rq, p, 0);
3840 check_preempt_curr(env->dst_rq, p, 0);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003841}
3842
3843/*
Peter Zijlstra029632f2011-10-25 10:00:11 +02003844 * Is this task likely cache-hot:
3845 */
3846static int
3847task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3848{
3849 s64 delta;
3850
3851 if (p->sched_class != &fair_sched_class)
3852 return 0;
3853
3854 if (unlikely(p->policy == SCHED_IDLE))
3855 return 0;
3856
3857 /*
3858 * Buddy candidates are cache hot:
3859 */
3860 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3861 (&p->se == cfs_rq_of(&p->se)->next ||
3862 &p->se == cfs_rq_of(&p->se)->last))
3863 return 1;
3864
3865 if (sysctl_sched_migration_cost == -1)
3866 return 1;
3867 if (sysctl_sched_migration_cost == 0)
3868 return 0;
3869
3870 delta = now - p->se.exec_start;
3871
3872 return delta < (s64)sysctl_sched_migration_cost;
3873}
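/*
 * Illustrative note (assumes the usual sysctl_sched_migration_cost
 * default of 500000ns): a task that executed on its current cpu within
 * the last 0.5ms is treated as cache hot, and can_migrate_task() below
 * will skip it unless sd->nr_balance_failed has grown past
 * cache_nice_tries. Setting the sysctl to -1 marks every task hot, 0
 * marks none.
 */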
3874
3875/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003876 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3877 */
3878static
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01003879int can_migrate_task(struct task_struct *p, struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003880{
3881 int tsk_cache_hot = 0;
3882 /*
3883 * We do not migrate tasks that are:
3884 * 1) running (obviously), or
3885 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3886 * 3) are cache-hot on their current CPU.
3887 */
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003888 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05303889 int new_dst_cpu;
3890
Lucas De Marchi41acab82010-03-10 23:37:45 -03003891 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05303892
3893 /*
3894 * Remember if this task can be migrated to any other cpu in
3895 * our sched_group. We may want to revisit it if we couldn't
3896 * meet load balance goals by pulling other tasks on src_cpu.
3897 *
3898 * Also avoid computing new_dst_cpu if we have already computed
3899 * one in current iteration.
3900 */
3901 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3902 return 0;
3903
3904 new_dst_cpu = cpumask_first_and(env->dst_grpmask,
3905 tsk_cpus_allowed(p));
3906 if (new_dst_cpu < nr_cpu_ids) {
3907 env->flags |= LBF_SOME_PINNED;
3908 env->new_dst_cpu = new_dst_cpu;
3909 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003910 return 0;
3911 }
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05303912
 3913 /* Record that we found at least one task that could run on dst_cpu */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01003914 env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003915
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003916 if (task_running(env->src_rq, p)) {
Lucas De Marchi41acab82010-03-10 23:37:45 -03003917 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003918 return 0;
3919 }
3920
3921 /*
3922 * Aggressive migration if:
3923 * 1) task is cache cold, or
3924 * 2) too many balance attempts have failed.
3925 */
3926
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01003927 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003928 if (!tsk_cache_hot ||
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01003929 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003930#ifdef CONFIG_SCHEDSTATS
3931 if (tsk_cache_hot) {
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01003932 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
Lucas De Marchi41acab82010-03-10 23:37:45 -03003933 schedstat_inc(p, se.statistics.nr_forced_migrations);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003934 }
3935#endif
3936 return 1;
3937 }
3938
3939 if (tsk_cache_hot) {
Lucas De Marchi41acab82010-03-10 23:37:45 -03003940 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003941 return 0;
3942 }
3943 return 1;
3944}
3945
Peter Zijlstra897c3952009-12-17 17:45:42 +01003946/*
3947 * move_one_task tries to move exactly one task from busiest to this_rq, as
3948 * part of active balancing operations within "domain".
3949 * Returns 1 if successful and 0 otherwise.
3950 *
3951 * Called with both runqueues locked.
3952 */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01003953static int move_one_task(struct lb_env *env)
Peter Zijlstra897c3952009-12-17 17:45:42 +01003954{
3955 struct task_struct *p, *n;
Peter Zijlstra897c3952009-12-17 17:45:42 +01003956
Peter Zijlstra367456c2012-02-20 21:49:09 +01003957 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
3958 if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu))
3959 continue;
Peter Zijlstra897c3952009-12-17 17:45:42 +01003960
Peter Zijlstra367456c2012-02-20 21:49:09 +01003961 if (!can_migrate_task(p, env))
3962 continue;
Peter Zijlstra897c3952009-12-17 17:45:42 +01003963
Peter Zijlstra367456c2012-02-20 21:49:09 +01003964 move_task(p, env);
3965 /*
3966 * Right now, this is only the second place move_task()
3967 * is called, so we can safely collect move_task()
3968 * stats here rather than inside move_task().
3969 */
3970 schedstat_inc(env->sd, lb_gained[env->idle]);
3971 return 1;
Peter Zijlstra897c3952009-12-17 17:45:42 +01003972 }
Peter Zijlstra897c3952009-12-17 17:45:42 +01003973 return 0;
3974}
3975
Peter Zijlstra367456c2012-02-20 21:49:09 +01003976static unsigned long task_h_load(struct task_struct *p);
3977
Peter Zijlstraeb953082012-04-17 13:38:40 +02003978static const unsigned int sched_nr_migrate_break = 32;
3979
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01003980/*
Peter Zijlstrabd939f42012-05-02 14:20:37 +02003981 * move_tasks tries to move up to imbalance weighted load from busiest to
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01003982 * this_rq, as part of a balancing operation within domain "sd".
3983 * Returns 1 if successful and 0 otherwise.
3984 *
3985 * Called with both runqueues locked.
3986 */
3987static int move_tasks(struct lb_env *env)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003988{
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01003989 struct list_head *tasks = &env->src_rq->cfs_tasks;
3990 struct task_struct *p;
Peter Zijlstra367456c2012-02-20 21:49:09 +01003991 unsigned long load;
3992 int pulled = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003993
Peter Zijlstrabd939f42012-05-02 14:20:37 +02003994 if (env->imbalance <= 0)
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01003995 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01003996
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01003997 while (!list_empty(tasks)) {
3998 p = list_first_entry(tasks, struct task_struct, se.group_node);
3999
Peter Zijlstra367456c2012-02-20 21:49:09 +01004000 env->loop++;
4001 /* We've more or less seen every task there is, call it quits */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01004002 if (env->loop > env->loop_max)
Peter Zijlstra367456c2012-02-20 21:49:09 +01004003 break;
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01004004
4005 /* take a breather every nr_migrate tasks */
Peter Zijlstra367456c2012-02-20 21:49:09 +01004006 if (env->loop > env->loop_break) {
Peter Zijlstraeb953082012-04-17 13:38:40 +02004007 env->loop_break += sched_nr_migrate_break;
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01004008 env->flags |= LBF_NEED_BREAK;
Peter Zijlstraee00e662009-12-17 17:25:20 +01004009 break;
Peter Zijlstraa195f002011-09-22 15:30:18 +02004010 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004011
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01004012 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
Peter Zijlstra367456c2012-02-20 21:49:09 +01004013 goto next;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004014
Peter Zijlstra367456c2012-02-20 21:49:09 +01004015 load = task_h_load(p);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01004016
Peter Zijlstraeb953082012-04-17 13:38:40 +02004017 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra367456c2012-02-20 21:49:09 +01004018 goto next;
4019
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004020 if ((load / 2) > env->imbalance)
Peter Zijlstra367456c2012-02-20 21:49:09 +01004021 goto next;
4022
4023 if (!can_migrate_task(p, env))
4024 goto next;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004025
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01004026 move_task(p, env);
Peter Zijlstraee00e662009-12-17 17:25:20 +01004027 pulled++;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004028 env->imbalance -= load;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004029
4030#ifdef CONFIG_PREEMPT
Peter Zijlstraee00e662009-12-17 17:25:20 +01004031 /*
4032 * NEWIDLE balancing is a source of latency, so preemptible
4033 * kernels will stop after the first task is pulled to minimize
4034 * the critical section.
4035 */
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01004036 if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstraee00e662009-12-17 17:25:20 +01004037 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004038#endif
4039
Peter Zijlstraee00e662009-12-17 17:25:20 +01004040 /*
4041 * We only want to steal up to the prescribed amount of
4042 * weighted load.
4043 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004044 if (env->imbalance <= 0)
Peter Zijlstraee00e662009-12-17 17:25:20 +01004045 break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004046
Peter Zijlstra367456c2012-02-20 21:49:09 +01004047 continue;
4048next:
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01004049 list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004050 }
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01004051
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004052 /*
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01004053 * Right now, this is one of only two places move_task() is called,
4054 * so we can safely collect move_task() stats here rather than
4055 * inside move_task().
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004056 */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01004057 schedstat_add(env->sd, lb_gained[env->idle], pulled);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004058
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01004059 return pulled;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004060}
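
/*
 * Illustrative walk-through of the loop above (made-up h_load values,
 * not taken from this file): with env->imbalance = 2048 and queued
 * tasks of h_load 1024, 900 and 512, the first pull leaves 1024, the
 * second passes the (load / 2) > imbalance test (450 <= 1024) and
 * leaves 124, while the third fails it (256 > 124) and is rotated to
 * the tail via the "next" label until loop_max winds the walk down.
 */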
4061
Peter Zijlstra230059de2009-12-17 17:47:12 +01004062#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004063/*
4064 * update tg->load_weight by folding this cpu's load_avg
4065 */
Paul Turner48a16752012-10-04 13:18:31 +02004066static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004067{
Paul Turner48a16752012-10-04 13:18:31 +02004068 struct sched_entity *se = tg->se[cpu];
4069 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004070
Paul Turner48a16752012-10-04 13:18:31 +02004071 /* throttled entities do not contribute to load */
4072 if (throttled_hierarchy(cfs_rq))
4073 return;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004074
Paul Turneraff3e492012-10-04 13:18:30 +02004075 update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004076
Paul Turner82958362012-10-04 13:18:31 +02004077 if (se) {
4078 update_entity_load_avg(se, 1);
4079 /*
4080 * We pivot on our runnable average having decayed to zero for
4081 * list removal. This generally implies that all our children
4082 * have also been removed (modulo rounding error or bandwidth
4083 * control); however, such cases are rare and we can fix these
4084 * at enqueue.
4085 *
4086 * TODO: fix up out-of-order children on enqueue.
4087 */
4088 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4089 list_del_leaf_cfs_rq(cfs_rq);
4090 } else {
Paul Turner48a16752012-10-04 13:18:31 +02004091 struct rq *rq = rq_of(cfs_rq);
Paul Turner82958362012-10-04 13:18:31 +02004092 update_rq_runnable_avg(rq, rq->nr_running);
4093 }
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004094}
4095
Paul Turner48a16752012-10-04 13:18:31 +02004096static void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004097{
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004098 struct rq *rq = cpu_rq(cpu);
Paul Turner48a16752012-10-04 13:18:31 +02004099 struct cfs_rq *cfs_rq;
4100 unsigned long flags;
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004101
Paul Turner48a16752012-10-04 13:18:31 +02004102 raw_spin_lock_irqsave(&rq->lock, flags);
4103 update_rq_clock(rq);
Peter Zijlstra9763b672011-07-13 13:09:25 +02004104 /*
4105 * Iterates the task_group tree in a bottom up fashion, see
4106 * list_add_leaf_cfs_rq() for details.
4107 */
Paul Turner64660c82011-07-21 09:43:36 -07004108 for_each_leaf_cfs_rq(rq, cfs_rq) {
Paul Turner48a16752012-10-04 13:18:31 +02004109 /*
4110 * Note: We may want to consider periodically releasing
4111	 * rq->lock around these updates so that creating many task
4112 * groups does not result in continually extending hold time.
4113 */
4114 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
Paul Turner64660c82011-07-21 09:43:36 -07004115 }
Paul Turner48a16752012-10-04 13:18:31 +02004116
4117 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004118}
4119
Peter Zijlstra9763b672011-07-13 13:09:25 +02004120/*
4121 * Compute the cpu's hierarchical load factor for each task group.
4122 * This needs to be done in a top-down fashion because the load of a child
4123 * group is a fraction of its parent's load.
4124 */
4125static int tg_load_down(struct task_group *tg, void *data)
4126{
4127 unsigned long load;
4128 long cpu = (long)data;
4129
4130 if (!tg->parent) {
4131 load = cpu_rq(cpu)->load.weight;
4132 } else {
4133 load = tg->parent->cfs_rq[cpu]->h_load;
4134 load *= tg->se[cpu]->load.weight;
4135 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4136 }
4137
4138 tg->cfs_rq[cpu]->h_load = load;
4139
4140 return 0;
4141}
4142
4143static void update_h_load(long cpu)
4144{
Peter Zijlstraa35b6462012-08-08 21:46:40 +02004145 struct rq *rq = cpu_rq(cpu);
4146 unsigned long now = jiffies;
4147
4148 if (rq->h_load_throttle == now)
4149 return;
4150
4151 rq->h_load_throttle = now;
4152
Peter Zijlstra367456c2012-02-20 21:49:09 +01004153 rcu_read_lock();
Peter Zijlstra9763b672011-07-13 13:09:25 +02004154 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
Peter Zijlstra367456c2012-02-20 21:49:09 +01004155 rcu_read_unlock();
Peter Zijlstra9763b672011-07-13 13:09:25 +02004156}
4157
Peter Zijlstra367456c2012-02-20 21:49:09 +01004158static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra230059de2009-12-17 17:47:12 +01004159{
Peter Zijlstra367456c2012-02-20 21:49:09 +01004160 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4161 unsigned long load;
Peter Zijlstra230059de2009-12-17 17:47:12 +01004162
Peter Zijlstra367456c2012-02-20 21:49:09 +01004163 load = p->se.load.weight;
4164 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
Peter Zijlstra230059de2009-12-17 17:47:12 +01004165
Peter Zijlstra367456c2012-02-20 21:49:09 +01004166 return load;
Peter Zijlstra230059de2009-12-17 17:47:12 +01004167}
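
/*
 * Worked example of the scaling above (hypothetical values): a task of
 * weight 1024 on a cfs_rq with hierarchical load h_load = 2048 and
 * queue weight 3072 is charged
 *	1024 * 2048 / (3072 + 1) ~= 682
 * i.e. the share of the group's h_load that its own weight represents
 * within the queue.
 */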
4168#else
Paul Turner48a16752012-10-04 13:18:31 +02004169static inline void update_blocked_averages(int cpu)
Peter Zijlstra9e3081c2010-11-15 15:47:02 -08004170{
4171}
4172
Peter Zijlstra367456c2012-02-20 21:49:09 +01004173static inline void update_h_load(long cpu)
Peter Zijlstra230059de2009-12-17 17:47:12 +01004174{
Peter Zijlstra367456c2012-02-20 21:49:09 +01004175}
4176
4177static unsigned long task_h_load(struct task_struct *p)
4178{
4179 return p->se.load.weight;
Peter Zijlstra230059de2009-12-17 17:47:12 +01004180}
4181#endif
4182
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004183/********** Helpers for find_busiest_group ************************/
4184/*
4185 * sd_lb_stats - Structure to store the statistics of a sched_domain
4186 * during load balancing.
4187 */
4188struct sd_lb_stats {
4189 struct sched_group *busiest; /* Busiest group in this sd */
4190 struct sched_group *this; /* Local group in this sd */
4191 unsigned long total_load; /* Total load of all groups in sd */
4192 unsigned long total_pwr; /* Total power of all groups in sd */
4193 unsigned long avg_load; /* Average load across all groups in sd */
4194
4195 /** Statistics of this group */
4196 unsigned long this_load;
4197 unsigned long this_load_per_task;
4198 unsigned long this_nr_running;
Nikhil Raofab47622010-10-15 13:12:29 -07004199 unsigned long this_has_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004200 unsigned int this_idle_cpus;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004201
4202 /* Statistics of the busiest group */
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004203 unsigned int busiest_idle_cpus;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004204 unsigned long max_load;
4205 unsigned long busiest_load_per_task;
4206 unsigned long busiest_nr_running;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004207 unsigned long busiest_group_capacity;
Nikhil Raofab47622010-10-15 13:12:29 -07004208 unsigned long busiest_has_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004209 unsigned int busiest_group_weight;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004210
4211 int group_imb; /* Is there imbalance in this sd */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004212};
4213
4214/*
4215 * sg_lb_stats - stats of a sched_group required for load_balancing
4216 */
4217struct sg_lb_stats {
4218	unsigned long avg_load; /* Avg load across the CPUs of the group */
4219 unsigned long group_load; /* Total load over the CPUs of the group */
4220 unsigned long sum_nr_running; /* Nr tasks running in the group */
4221 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4222 unsigned long group_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004223 unsigned long idle_cpus;
4224 unsigned long group_weight;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004225 int group_imb; /* Is there an imbalance in the group ? */
Nikhil Raofab47622010-10-15 13:12:29 -07004226 int group_has_capacity; /* Is there extra capacity in the group? */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004227};
4228
4229/**
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004230 * get_sd_load_idx - Obtain the load index for a given sched domain.
4231 * @sd: The sched_domain whose load_idx is to be obtained.
4232 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4233 */
4234static inline int get_sd_load_idx(struct sched_domain *sd,
4235 enum cpu_idle_type idle)
4236{
4237 int load_idx;
4238
4239 switch (idle) {
4240 case CPU_NOT_IDLE:
4241 load_idx = sd->busy_idx;
4242 break;
4243
4244 case CPU_NEWLY_IDLE:
4245 load_idx = sd->newidle_idx;
4246 break;
4247 default:
4248 load_idx = sd->idle_idx;
4249 break;
4250 }
4251
4252 return load_idx;
4253}
4254
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004255unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
4256{
Nikhil Rao1399fa72011-05-18 10:09:39 -07004257 return SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004258}
4259
4260unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4261{
4262 return default_scale_freq_power(sd, cpu);
4263}
4264
4265unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
4266{
Peter Zijlstra669c55e2010-04-16 14:59:29 +02004267 unsigned long weight = sd->span_weight;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004268 unsigned long smt_gain = sd->smt_gain;
4269
4270 smt_gain /= weight;
4271
4272 return smt_gain;
4273}
4274
4275unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4276{
4277 return default_scale_smt_power(sd, cpu);
4278}
4279
4280unsigned long scale_rt_power(int cpu)
4281{
4282 struct rq *rq = cpu_rq(cpu);
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02004283 u64 total, available, age_stamp, avg;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004284
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02004285 /*
4286 * Since we're reading these variables without serialization make sure
4287 * we read them once before doing sanity checks on them.
4288 */
4289 age_stamp = ACCESS_ONCE(rq->age_stamp);
4290 avg = ACCESS_ONCE(rq->rt_avg);
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07004291
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02004292 total = sched_avg_period() + (rq->clock - age_stamp);
4293
4294 if (unlikely(total < avg)) {
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07004295 /* Ensures that power won't end up being negative */
4296 available = 0;
4297 } else {
Peter Zijlstrab654f7d2012-05-22 14:04:28 +02004298 available = total - avg;
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07004299 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004300
Nikhil Rao1399fa72011-05-18 10:09:39 -07004301 if (unlikely((s64)total < SCHED_POWER_SCALE))
4302 total = SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004303
Nikhil Rao1399fa72011-05-18 10:09:39 -07004304 total >>= SCHED_POWER_SHIFT;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004305
4306 return div_u64(available, total);
4307}
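
/*
 * Rough numerical sketch (made-up figures): if the window "total" is
 * 2,000,000ns and rq->rt_avg accounts 500,000ns of RT/IRQ time, then
 * available = 1,500,000.  After total >>= SCHED_POWER_SHIFT (~1953)
 * the division yields ~768, i.e. roughly 75% of SCHED_POWER_SCALE
 * (1024) remains for CFS tasks on this cpu.
 */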
4308
4309static void update_cpu_power(struct sched_domain *sd, int cpu)
4310{
Peter Zijlstra669c55e2010-04-16 14:59:29 +02004311 unsigned long weight = sd->span_weight;
Nikhil Rao1399fa72011-05-18 10:09:39 -07004312 unsigned long power = SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004313 struct sched_group *sdg = sd->groups;
4314
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004315 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4316 if (sched_feat(ARCH_POWER))
4317 power *= arch_scale_smt_power(sd, cpu);
4318 else
4319 power *= default_scale_smt_power(sd, cpu);
4320
Nikhil Rao1399fa72011-05-18 10:09:39 -07004321 power >>= SCHED_POWER_SHIFT;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004322 }
4323
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004324 sdg->sgp->power_orig = power;
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004325
4326 if (sched_feat(ARCH_POWER))
4327 power *= arch_scale_freq_power(sd, cpu);
4328 else
4329 power *= default_scale_freq_power(sd, cpu);
4330
Nikhil Rao1399fa72011-05-18 10:09:39 -07004331 power >>= SCHED_POWER_SHIFT;
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004332
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004333 power *= scale_rt_power(cpu);
Nikhil Rao1399fa72011-05-18 10:09:39 -07004334 power >>= SCHED_POWER_SHIFT;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004335
4336 if (!power)
4337 power = 1;
4338
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02004339 cpu_rq(cpu)->cpu_power = power;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004340 sdg->sgp->power = power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004341}
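
/*
 * Example of the chained scaling above (illustrative numbers only):
 * starting from SCHED_POWER_SCALE = 1024, an SMT sibling with
 * smt_gain = 1178 shared across weight = 2 contributes 589, so
 * power = (1024 * 589) >> 10 = 589; an unchanged frequency factor of
 * 1024 keeps it at 589; and an RT factor of 768 from scale_rt_power()
 * leaves (589 * 768) >> 10 = 441 in rq->cpu_power and sdg->sgp->power.
 */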
4342
Peter Zijlstra029632f2011-10-25 10:00:11 +02004343void update_group_power(struct sched_domain *sd, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004344{
4345 struct sched_domain *child = sd->child;
4346 struct sched_group *group, *sdg = sd->groups;
4347 unsigned long power;
Vincent Guittot4ec44122011-12-12 20:21:08 +01004348 unsigned long interval;
4349
4350 interval = msecs_to_jiffies(sd->balance_interval);
4351 interval = clamp(interval, 1UL, max_load_balance_interval);
4352 sdg->sgp->next_update = jiffies + interval;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004353
4354 if (!child) {
4355 update_cpu_power(sd, cpu);
4356 return;
4357 }
4358
4359 power = 0;
4360
Peter Zijlstra74a5ce22012-05-23 18:00:43 +02004361 if (child->flags & SD_OVERLAP) {
4362 /*
4363 * SD_OVERLAP domains cannot assume that child groups
4364 * span the current group.
4365 */
4366
4367 for_each_cpu(cpu, sched_group_cpus(sdg))
4368 power += power_of(cpu);
4369 } else {
4370 /*
4371 * !SD_OVERLAP domains can assume that child groups
4372 * span the current group.
4373 */
4374
4375 group = child->groups;
4376 do {
4377 power += group->sgp->power;
4378 group = group->next;
4379 } while (group != child->groups);
4380 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004381
Peter Zijlstrac3decf02012-05-31 12:05:32 +02004382 sdg->sgp->power_orig = sdg->sgp->power = power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004383}
4384
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004385/*
4386 * Try and fix up capacity for tiny siblings; this is needed when
4387 * things like SD_ASYM_PACKING need f_b_g to select another sibling
4388 * which on its own isn't powerful enough.
4389 *
4390 * See update_sd_pick_busiest() and check_asym_packing().
4391 */
4392static inline int
4393fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4394{
4395 /*
Nikhil Rao1399fa72011-05-18 10:09:39 -07004396 * Only siblings can have significantly less than SCHED_POWER_SCALE
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004397 */
Peter Zijlstraa6c75f22011-04-07 14:09:52 +02004398 if (!(sd->flags & SD_SHARE_CPUPOWER))
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004399 return 0;
4400
4401 /*
4402 * If ~90% of the cpu_power is still there, we're good.
4403 */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004404 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004405 return 1;
4406
4407 return 0;
4408}
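
/*
 * The 32/29 comparison above approximates a ~90% threshold in integer
 * math.  Hypothetical example: a sibling with power_orig = 512 whose
 * power dropped to 470 still passes (470 * 32 = 15040 > 512 * 29 =
 * 14848) and gets its capacity fixed up to 1; at 450 it fails and the
 * capacity stays 0.
 */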
4409
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004410/**
4411 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07004412 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004413 * @group: sched_group whose statistics are to be updated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004414 * @load_idx: Load index of sched_domain of this_cpu for load calc.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004415 * @local_group: Does group contain this_cpu.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004416 * @balance: Should we balance.
4417 * @sgs: variable to hold the statistics for this group.
4418 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004419static inline void update_sg_lb_stats(struct lb_env *env,
4420 struct sched_group *group, int load_idx,
Michael Wangb94031302012-07-12 16:10:13 +08004421 int local_group, int *balance, struct sg_lb_stats *sgs)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004422{
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004423 unsigned long nr_running, max_nr_running, min_nr_running;
4424 unsigned long load, max_cpu_load, min_cpu_load;
Peter Zijlstra04f733b2012-05-11 00:12:02 +02004425 unsigned int balance_cpu = -1, first_idle_cpu = 0;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004426 unsigned long avg_load_per_task = 0;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004427 int i;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004428
Gautham R Shenoy871e35b2010-01-20 14:02:44 -06004429 if (local_group)
Peter Zijlstrac1174872012-05-31 14:47:33 +02004430 balance_cpu = group_balance_cpu(group);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004431
4432 /* Tally up the load of all CPUs in the group */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004433 max_cpu_load = 0;
4434 min_cpu_load = ~0UL;
Nikhil Rao2582f0e2010-10-13 12:09:36 -07004435 max_nr_running = 0;
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004436 min_nr_running = ~0UL;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004437
Michael Wangb94031302012-07-12 16:10:13 +08004438 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004439 struct rq *rq = cpu_rq(i);
4440
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004441 nr_running = rq->nr_running;
4442
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004443 /* Bias balancing toward cpus of our domain */
4444 if (local_group) {
Peter Zijlstrac1174872012-05-31 14:47:33 +02004445 if (idle_cpu(i) && !first_idle_cpu &&
4446 cpumask_test_cpu(i, sched_group_mask(group))) {
Peter Zijlstra04f733b2012-05-11 00:12:02 +02004447 first_idle_cpu = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004448 balance_cpu = i;
4449 }
Peter Zijlstra04f733b2012-05-11 00:12:02 +02004450
4451 load = target_load(i, load_idx);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004452 } else {
4453 load = source_load(i, load_idx);
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004454 if (load > max_cpu_load)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004455 max_cpu_load = load;
4456 if (min_cpu_load > load)
4457 min_cpu_load = load;
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004458
4459 if (nr_running > max_nr_running)
4460 max_nr_running = nr_running;
4461 if (min_nr_running > nr_running)
4462 min_nr_running = nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004463 }
4464
4465 sgs->group_load += load;
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004466 sgs->sum_nr_running += nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004467 sgs->sum_weighted_load += weighted_cpuload(i);
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004468 if (idle_cpu(i))
4469 sgs->idle_cpus++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004470 }
4471
4472	 * The first idle cpu or the first cpu (busiest) in this sched group
4473	 * is eligible for doing load balancing at this and higher
4474	 * domains. In the newly idle case, we will allow all the cpus
4475	 * to do the newly idle load balance.
4476 * to do the newly idle load balance.
4477 */
Vincent Guittot4ec44122011-12-12 20:21:08 +01004478 if (local_group) {
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004479 if (env->idle != CPU_NEWLY_IDLE) {
Peter Zijlstra04f733b2012-05-11 00:12:02 +02004480 if (balance_cpu != env->dst_cpu) {
Vincent Guittot4ec44122011-12-12 20:21:08 +01004481 *balance = 0;
4482 return;
4483 }
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004484 update_group_power(env->sd, env->dst_cpu);
Vincent Guittot4ec44122011-12-12 20:21:08 +01004485 } else if (time_after_eq(jiffies, group->sgp->next_update))
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004486 update_group_power(env->sd, env->dst_cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004487 }
4488
4489 /* Adjust by relative CPU power of the group */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004490 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004491
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004492 /*
4493 * Consider the group unbalanced when the imbalance is larger
Peter Zijlstra866ab432011-02-21 18:56:47 +01004494 * than the average weight of a task.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004495 *
4496 * APZ: with cgroup the avg task weight can vary wildly and
4497 * might not be a suitable number - should we keep a
4498 * normalized nr_running number somewhere that negates
4499 * the hierarchy?
4500 */
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004501 if (sgs->sum_nr_running)
4502 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004503
Peter Zijlstrae44bc5c2012-05-11 00:22:12 +02004504 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4505 (max_nr_running - min_nr_running) > 1)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004506 sgs->group_imb = 1;
4507
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004508 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
Nikhil Rao1399fa72011-05-18 10:09:39 -07004509 SCHED_POWER_SCALE);
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004510 if (!sgs->group_capacity)
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004511 sgs->group_capacity = fix_small_capacity(env->sd, group);
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004512 sgs->group_weight = group->group_weight;
Nikhil Raofab47622010-10-15 13:12:29 -07004513
4514 if (sgs->group_capacity > sgs->sum_nr_running)
4515 sgs->group_has_capacity = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004516}
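
/*
 * Example for the group_imb test above (invented numbers): with
 * avg_load_per_task = 700, a spread of max_cpu_load = 2000 vs.
 * min_cpu_load = 1000 (difference 1000 >= 700) together with
 * max_nr_running = 3 vs. min_nr_running = 1 (difference 2 > 1) marks
 * the group imbalanced, which later makes find_busiest_group() jump
 * straight to force_balance.
 */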
4517
4518/**
Michael Neuling532cb4c2010-06-08 14:57:02 +10004519 * update_sd_pick_busiest - return 1 on busiest group
Randy Dunlapcd968912012-06-08 13:18:33 -07004520 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10004521 * @sds: sched_domain statistics
4522 * @sg: sched_group candidate to be checked for being the busiest
Michael Neulingb6b12292010-06-10 12:06:21 +10004523 * @sgs: sched_group statistics
Michael Neuling532cb4c2010-06-08 14:57:02 +10004524 *
4525 * Determine if @sg is a busier group than the previously selected
4526 * busiest group.
4527 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004528static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling532cb4c2010-06-08 14:57:02 +10004529 struct sd_lb_stats *sds,
4530 struct sched_group *sg,
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004531 struct sg_lb_stats *sgs)
Michael Neuling532cb4c2010-06-08 14:57:02 +10004532{
4533 if (sgs->avg_load <= sds->max_load)
4534 return false;
4535
4536 if (sgs->sum_nr_running > sgs->group_capacity)
4537 return true;
4538
4539 if (sgs->group_imb)
4540 return true;
4541
4542 /*
4543 * ASYM_PACKING needs to move all the work to the lowest
4544 * numbered CPUs in the group, therefore mark all groups
4545 * higher than ourself as busy.
4546 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004547 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4548 env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10004549 if (!sds->busiest)
4550 return true;
4551
4552 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4553 return true;
4554 }
4555
4556 return false;
4557}
4558
4559/**
Hui Kang461819a2011-10-11 23:00:59 -04004560 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07004561 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004562 * @balance: Should we balance.
4563 * @sds: variable to hold the statistics for this sched_domain.
4564 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004565static inline void update_sd_lb_stats(struct lb_env *env,
Michael Wangb94031302012-07-12 16:10:13 +08004566 int *balance, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004567{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004568 struct sched_domain *child = env->sd->child;
4569 struct sched_group *sg = env->sd->groups;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004570 struct sg_lb_stats sgs;
4571 int load_idx, prefer_sibling = 0;
4572
4573 if (child && child->flags & SD_PREFER_SIBLING)
4574 prefer_sibling = 1;
4575
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004576 load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004577
4578 do {
4579 int local_group;
4580
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004581 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004582 memset(&sgs, 0, sizeof(sgs));
Michael Wangb94031302012-07-12 16:10:13 +08004583 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004584
Peter Zijlstra8f190fb2009-12-24 14:18:21 +01004585 if (local_group && !(*balance))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004586 return;
4587
4588 sds->total_load += sgs.group_load;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004589 sds->total_pwr += sg->sgp->power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004590
4591 /*
4592 * In case the child domain prefers tasks go to siblings
Michael Neuling532cb4c2010-06-08 14:57:02 +10004593 * first, lower the sg capacity to one so that we'll try
Nikhil Rao75dd3212010-10-15 13:12:30 -07004594 * and move all the excess tasks away. We lower the capacity
4595 * of a group only if the local group has the capacity to fit
4596 * these excess tasks, i.e. nr_running < group_capacity. The
4597 * extra check prevents the case where you always pull from the
4598 * heaviest group when it is already under-utilized (possible
4599 * when a large weight task outweighs the other tasks on the system).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004600 */
Nikhil Rao75dd3212010-10-15 13:12:30 -07004601 if (prefer_sibling && !local_group && sds->this_has_capacity)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004602 sgs.group_capacity = min(sgs.group_capacity, 1UL);
4603
4604 if (local_group) {
4605 sds->this_load = sgs.avg_load;
Michael Neuling532cb4c2010-06-08 14:57:02 +10004606 sds->this = sg;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004607 sds->this_nr_running = sgs.sum_nr_running;
4608 sds->this_load_per_task = sgs.sum_weighted_load;
Nikhil Raofab47622010-10-15 13:12:29 -07004609 sds->this_has_capacity = sgs.group_has_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004610 sds->this_idle_cpus = sgs.idle_cpus;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004611 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004612 sds->max_load = sgs.avg_load;
Michael Neuling532cb4c2010-06-08 14:57:02 +10004613 sds->busiest = sg;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004614 sds->busiest_nr_running = sgs.sum_nr_running;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004615 sds->busiest_idle_cpus = sgs.idle_cpus;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004616 sds->busiest_group_capacity = sgs.group_capacity;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004617 sds->busiest_load_per_task = sgs.sum_weighted_load;
Nikhil Raofab47622010-10-15 13:12:29 -07004618 sds->busiest_has_capacity = sgs.group_has_capacity;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004619 sds->busiest_group_weight = sgs.group_weight;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004620 sds->group_imb = sgs.group_imb;
4621 }
4622
Michael Neuling532cb4c2010-06-08 14:57:02 +10004623 sg = sg->next;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004624 } while (sg != env->sd->groups);
Michael Neuling532cb4c2010-06-08 14:57:02 +10004625}
4626
Michael Neuling532cb4c2010-06-08 14:57:02 +10004627/**
4628 * check_asym_packing - Check to see if the group is packed into the
4629 * sched doman.
4630 * sched domain.
4631 * This is primarily intended to used at the sibling level. Some
4632 * This is primarily intended to be used at the sibling level. Some
4633 * case of POWER7, it can move to lower SMT modes only when higher
4634 * threads are idle. When in lower SMT modes, the threads will
4635 * perform better since they share less core resources. Hence when we
4636 * have idle threads, we want them to be the higher ones.
4637 *
4638 * This packing function is run on idle threads. It checks to see if
4639 * the busiest CPU in this domain (core in the P7 case) has a higher
4640 * CPU number than the packing function is being run on. Here we are
4641 * assuming a lower CPU number will be equivalent to a lower SMT thread
4642 * number.
4643 *
Michael Neulingb6b12292010-06-10 12:06:21 +10004644 * Returns 1 when packing is required and a task should be moved to
4645 * this CPU. The amount of the imbalance is returned in env->imbalance.
4646 *
Randy Dunlapcd968912012-06-08 13:18:33 -07004647 * @env: The load balancing environment.
Michael Neuling532cb4c2010-06-08 14:57:02 +10004648 * @sds: Statistics of the sched_domain which is to be packed
Michael Neuling532cb4c2010-06-08 14:57:02 +10004649 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004650static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
Michael Neuling532cb4c2010-06-08 14:57:02 +10004651{
4652 int busiest_cpu;
4653
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004654 if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling532cb4c2010-06-08 14:57:02 +10004655 return 0;
4656
4657 if (!sds->busiest)
4658 return 0;
4659
4660 busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004661 if (env->dst_cpu > busiest_cpu)
Michael Neuling532cb4c2010-06-08 14:57:02 +10004662 return 0;
4663
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004664 env->imbalance = DIV_ROUND_CLOSEST(
4665 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4666
Michael Neuling532cb4c2010-06-08 14:57:02 +10004667 return 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004668}
4669
4670/**
4671 * fix_small_imbalance - Calculate the minor imbalance that exists
4672 * amongst the groups of a sched_domain, during
4673 * load balancing.
Randy Dunlapcd968912012-06-08 13:18:33 -07004674 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004675 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004676 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004677static inline
4678void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004679{
4680 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4681 unsigned int imbn = 2;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004682 unsigned long scaled_busy_load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004683
4684 if (sds->this_nr_running) {
4685 sds->this_load_per_task /= sds->this_nr_running;
4686 if (sds->busiest_load_per_task >
4687 sds->this_load_per_task)
4688 imbn = 1;
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004689 } else {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004690 sds->this_load_per_task =
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004691 cpu_avg_load_per_task(env->dst_cpu);
4692 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004693
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004694 scaled_busy_load_per_task = sds->busiest_load_per_task
Nikhil Rao1399fa72011-05-18 10:09:39 -07004695 * SCHED_POWER_SCALE;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004696 scaled_busy_load_per_task /= sds->busiest->sgp->power;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004697
4698 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4699 (scaled_busy_load_per_task * imbn)) {
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004700 env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004701 return;
4702 }
4703
4704 /*
4705	 * OK, we don't have enough imbalance to justify moving tasks;
4706	 * however, we may be able to increase total CPU power used by
4707 * moving them.
4708 */
4709
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004710 pwr_now += sds->busiest->sgp->power *
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004711 min(sds->busiest_load_per_task, sds->max_load);
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004712 pwr_now += sds->this->sgp->power *
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004713 min(sds->this_load_per_task, sds->this_load);
Nikhil Rao1399fa72011-05-18 10:09:39 -07004714 pwr_now /= SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004715
4716 /* Amount of load we'd subtract */
Nikhil Rao1399fa72011-05-18 10:09:39 -07004717 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004718 sds->busiest->sgp->power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004719 if (sds->max_load > tmp)
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004720 pwr_move += sds->busiest->sgp->power *
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004721 min(sds->busiest_load_per_task, sds->max_load - tmp);
4722
4723 /* Amount of load we'd add */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004724 if (sds->max_load * sds->busiest->sgp->power <
Nikhil Rao1399fa72011-05-18 10:09:39 -07004725 sds->busiest_load_per_task * SCHED_POWER_SCALE)
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004726 tmp = (sds->max_load * sds->busiest->sgp->power) /
4727 sds->this->sgp->power;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004728 else
Nikhil Rao1399fa72011-05-18 10:09:39 -07004729 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004730 sds->this->sgp->power;
4731 pwr_move += sds->this->sgp->power *
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004732 min(sds->this_load_per_task, sds->this_load + tmp);
Nikhil Rao1399fa72011-05-18 10:09:39 -07004733 pwr_move /= SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004734
4735 /* Move if we gain throughput */
4736 if (pwr_move > pwr_now)
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004737 env->imbalance = sds->busiest_load_per_task;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004738}
4739
4740/**
4741 * calculate_imbalance - Calculate the amount of imbalance present within the
4742 * groups of a given sched_domain during load balance.
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004743 * @env: load balance environment
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004744 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004745 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004746static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004747{
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004748 unsigned long max_pull, load_above_capacity = ~0UL;
4749
4750 sds->busiest_load_per_task /= sds->busiest_nr_running;
4751 if (sds->group_imb) {
4752 sds->busiest_load_per_task =
4753 min(sds->busiest_load_per_task, sds->avg_load);
4754 }
4755
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004756 /*
4757 * In the presence of smp nice balancing, certain scenarios can have
4758	 * max load less than avg load (as we skip the groups at or below
4759	 * their cpu_power while calculating max_load).
4760 */
4761 if (sds->max_load < sds->avg_load) {
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004762 env->imbalance = 0;
4763 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004764 }
4765
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004766 if (!sds->group_imb) {
4767 /*
4768 * Don't want to pull so many tasks that a group would go idle.
4769 */
4770 load_above_capacity = (sds->busiest_nr_running -
4771 sds->busiest_group_capacity);
4772
Nikhil Rao1399fa72011-05-18 10:09:39 -07004773 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004774
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004775 load_above_capacity /= sds->busiest->sgp->power;
Suresh Siddhadd5feea2010-02-23 16:13:52 -08004776 }
4777
4778 /*
4779 * We're trying to get all the cpus to the average_load, so we don't
4780 * want to push ourselves above the average load, nor do we wish to
4781 * reduce the max loaded cpu below the average load. At the same time,
4782 * we also don't want to reduce the group load below the group capacity
4783 * (so that we can implement power-savings policies etc). Thus we look
4784 * for the minimum possible imbalance.
4785 * Be careful of negative numbers as they'll appear as very large values
4786 * with unsigned longs.
4787 */
4788 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004789
4790 /* How much load to actually move to equalise the imbalance */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004791 env->imbalance = min(max_pull * sds->busiest->sgp->power,
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02004792 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
Nikhil Rao1399fa72011-05-18 10:09:39 -07004793 / SCHED_POWER_SCALE;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004794
4795 /*
4796	 * if env->imbalance is less than the average load per runnable task
Lucas De Marchi25985ed2011-03-30 22:57:33 -03004797	 * there is no guarantee that any tasks will be moved, so consider
4798	 * bumping its value to force at least one task to be
4799	 * moved
4800 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004801 if (env->imbalance < sds->busiest_load_per_task)
4802 return fix_small_imbalance(env, sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004803
4804}
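
/*
 * Numerical sketch of the max_pull clamp above (made-up values): with
 * avg_load = 1000, max_load = 1400, this_load = 800 and
 * load_above_capacity = 300, max_pull = min(1400 - 1000, 300) = 300.
 * With both groups at power 1024 the imbalance becomes
 * min(300 * 1024, (1000 - 800) * 1024) / 1024 = 200, so we never try
 * to pull the local group above the average load.
 */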
Nikhil Raofab47622010-10-15 13:12:29 -07004805
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004806/******* find_busiest_group() helpers end here *********************/
4807
4808/**
4809 * find_busiest_group - Returns the busiest group within the sched_domain
4810 * if there is an imbalance. If there isn't an imbalance, and
4811 * the user has opted for power-savings, it returns a group whose
4812 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4813 * such a group exists.
4814 *
4815 * Also calculates the amount of weighted load which should be moved
4816 * to restore balance.
4817 *
Randy Dunlapcd968912012-06-08 13:18:33 -07004818 * @env: The load balancing environment.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004819 * @balance: Pointer to a variable indicating if this_cpu
4820 * is the appropriate cpu to perform load balancing at this level.
4821 *
4822 * Returns: - the busiest group if imbalance exists.
4823 * - If no imbalance and user has opted for power-savings balance,
4824 * return the least loaded group whose CPUs can be
4825 * put to idle by rebalancing its tasks onto our group.
4826 */
4827static struct sched_group *
Michael Wangb94031302012-07-12 16:10:13 +08004828find_busiest_group(struct lb_env *env, int *balance)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004829{
4830 struct sd_lb_stats sds;
4831
4832 memset(&sds, 0, sizeof(sds));
4833
4834 /*
4835	 * Compute the various statistics relevant for load balancing at
4836 * this level.
4837 */
Michael Wangb94031302012-07-12 16:10:13 +08004838 update_sd_lb_stats(env, balance, &sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004839
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004840 /*
4841 * this_cpu is not the appropriate cpu to perform load balancing at
4842 * this level.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004843 */
Peter Zijlstra8f190fb2009-12-24 14:18:21 +01004844 if (!(*balance))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004845 goto ret;
4846
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004847 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4848 check_asym_packing(env, &sds))
Michael Neuling532cb4c2010-06-08 14:57:02 +10004849 return sds.busiest;
4850
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004851 /* There is no busy sibling group to pull tasks from */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004852 if (!sds.busiest || sds.busiest_nr_running == 0)
4853 goto out_balanced;
4854
Nikhil Rao1399fa72011-05-18 10:09:39 -07004855 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
Ken Chenb0432d82011-04-07 17:23:22 -07004856
Peter Zijlstra866ab432011-02-21 18:56:47 +01004857 /*
4858 * If the busiest group is imbalanced the below checks don't
4859	 * work because they assume all things are equal, which typically
4860 * isn't true due to cpus_allowed constraints and the like.
4861 */
4862 if (sds.group_imb)
4863 goto force_balance;
4864
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004865 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004866 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
Nikhil Raofab47622010-10-15 13:12:29 -07004867 !sds.busiest_has_capacity)
4868 goto force_balance;
4869
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004870 /*
4871 * If the local group is more busy than the selected busiest group
4872 * don't try and pull any tasks.
4873 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004874 if (sds.this_load >= sds.max_load)
4875 goto out_balanced;
4876
Peter Zijlstracc57aa82011-02-21 18:55:32 +01004877 /*
4878 * Don't pull any tasks if this group is already above the domain
4879 * average load.
4880 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004881 if (sds.this_load >= sds.avg_load)
4882 goto out_balanced;
4883
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004884 if (env->idle == CPU_IDLE) {
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004885 /*
4886		 * This cpu is idle. If the busiest group doesn't
4887		 * have more tasks than the number of available cpus and
4888		 * there is no imbalance between this and the busiest group
4889		 * wrt idle cpus, it is balanced.
4890 */
Peter Zijlstrac186faf2011-02-21 18:52:53 +01004891 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004892 sds.busiest_nr_running <= sds.busiest_group_weight)
4893 goto out_balanced;
Peter Zijlstrac186faf2011-02-21 18:52:53 +01004894 } else {
4895 /*
4896 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4897 * imbalance_pct to be conservative.
4898 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004899 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
Peter Zijlstrac186faf2011-02-21 18:52:53 +01004900 goto out_balanced;
Suresh Siddhaaae6d3d2010-09-17 15:02:32 -07004901 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004902
Nikhil Raofab47622010-10-15 13:12:29 -07004903force_balance:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004904 /* Looks like there is an imbalance. Compute it */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004905 calculate_imbalance(env, &sds);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004906 return sds.busiest;
4907
4908out_balanced:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004909ret:
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004910 env->imbalance = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004911 return NULL;
4912}
4913
4914/*
4915 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4916 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004917static struct rq *find_busiest_queue(struct lb_env *env,
Michael Wangb94031302012-07-12 16:10:13 +08004918 struct sched_group *group)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004919{
4920 struct rq *busiest = NULL, *rq;
4921 unsigned long max_load = 0;
4922 int i;
4923
4924 for_each_cpu(i, sched_group_cpus(group)) {
4925 unsigned long power = power_of(i);
Nikhil Rao1399fa72011-05-18 10:09:39 -07004926 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4927 SCHED_POWER_SCALE);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004928 unsigned long wl;
4929
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004930 if (!capacity)
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004931 capacity = fix_small_capacity(env->sd, group);
Srivatsa Vaddagiri9d5efe02010-06-08 14:57:02 +10004932
Michael Wangb94031302012-07-12 16:10:13 +08004933 if (!cpumask_test_cpu(i, env->cpus))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004934 continue;
4935
4936 rq = cpu_rq(i);
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01004937 wl = weighted_cpuload(i);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004938
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01004939 /*
4940 * When comparing with imbalance, use weighted_cpuload()
4941 * which is not scaled with the cpu power.
4942 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004943 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004944 continue;
4945
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01004946 /*
4947 * For the load comparisons with the other cpu's, consider
4948 * the weighted_cpuload() scaled with the cpu power, so that
4949 * the load can be moved away from the cpu that is potentially
4950 * running at a lower capacity.
4951 */
Nikhil Rao1399fa72011-05-18 10:09:39 -07004952 wl = (wl * SCHED_POWER_SCALE) / power;
Thomas Gleixner6e40f5b2010-02-16 16:48:56 +01004953
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004954 if (wl > max_load) {
4955 max_load = wl;
4956 busiest = rq;
4957 }
4958 }
4959
4960 return busiest;
4961}
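
/*
 * Why wl is rescaled by cpu power (hypothetical example): cpu A with
 * power 1024 and weighted load 2048 scores 2048, while cpu B with
 * power 512 and weighted load 1536 scores (1536 * 1024) / 512 = 3072,
 * so the weaker, relatively more overloaded cpu B is picked as busiest
 * even though its raw load is smaller.
 */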
4962
4963/*
4964 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4965 * it just needs to be large enough.
4966 */
4967#define MAX_PINNED_INTERVAL 512
4968
4969/* Working cpumask for load_balance and load_balance_newidle. */
Peter Zijlstra029632f2011-10-25 10:00:11 +02004970DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004971
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004972static int need_active_balance(struct lb_env *env)
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01004973{
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004974 struct sched_domain *sd = env->sd;
4975
4976 if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling532cb4c2010-06-08 14:57:02 +10004977
4978 /*
4979 * ASYM_PACKING needs to force migrate tasks from busy but
4980 * higher numbered CPUs in order to pack all tasks in the
4981 * lowest numbered CPUs.
4982 */
Peter Zijlstrabd939f42012-05-02 14:20:37 +02004983 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling532cb4c2010-06-08 14:57:02 +10004984 return 1;
Peter Zijlstra1af3ed32009-12-23 15:10:31 +01004985 }
4986
4987 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
4988}
4989
Tejun Heo969c7922010-05-06 18:49:21 +02004990static int active_load_balance_cpu_stop(void *data);
4991
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01004992/*
4993 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4994 * tasks if there is an imbalance.
4995 */
4996static int load_balance(int this_cpu, struct rq *this_rq,
4997 struct sched_domain *sd, enum cpu_idle_type idle,
4998 int *balance)
4999{
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305000 int ld_moved, cur_ld_moved, active_balance = 0;
5001 int lb_iterations, max_lb_iterations;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005002 struct sched_group *group;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005003 struct rq *busiest;
5004 unsigned long flags;
5005 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
5006
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005007 struct lb_env env = {
5008 .sd = sd,
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01005009 .dst_cpu = this_cpu,
5010 .dst_rq = this_rq,
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305011 .dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005012 .idle = idle,
Peter Zijlstraeb953082012-04-17 13:38:40 +02005013 .loop_break = sched_nr_migrate_break,
Michael Wangb94031302012-07-12 16:10:13 +08005014 .cpus = cpus,
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005015 };
5016
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005017 cpumask_copy(cpus, cpu_active_mask);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305018 max_lb_iterations = cpumask_weight(env.dst_grpmask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005019
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005020 schedstat_inc(sd, lb_count[idle]);
5021
5022redo:
Michael Wangb94031302012-07-12 16:10:13 +08005023 group = find_busiest_group(&env, balance);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005024
5025 if (*balance == 0)
5026 goto out_balanced;
5027
5028 if (!group) {
5029 schedstat_inc(sd, lb_nobusyg[idle]);
5030 goto out_balanced;
5031 }
5032
Michael Wangb94031302012-07-12 16:10:13 +08005033 busiest = find_busiest_queue(&env, group);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005034 if (!busiest) {
5035 schedstat_inc(sd, lb_nobusyq[idle]);
5036 goto out_balanced;
5037 }
5038
Michael Wang78feefc2012-08-06 16:41:59 +08005039 BUG_ON(busiest == env.dst_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005040
Peter Zijlstrabd939f42012-05-02 14:20:37 +02005041 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005042
5043 ld_moved = 0;
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305044 lb_iterations = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005045 if (busiest->nr_running > 1) {
5046 /*
5047 * Attempt to move tasks. If find_busiest_group has found
5048 * an imbalance but busiest->nr_running <= 1, the group is
5049 * still unbalanced. ld_moved simply stays zero, so it is
5050 * correctly treated as an imbalance.
5051 */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005052 env.flags |= LBF_ALL_PINNED;
Peter Zijlstrac82513e2012-04-26 13:12:27 +02005053 env.src_cpu = busiest->cpu;
5054 env.src_rq = busiest;
5055 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005056
Peter Zijlstraa35b6462012-08-08 21:46:40 +02005057 update_h_load(env.src_cpu);
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01005058more_balance:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005059 local_irq_save(flags);
Michael Wang78feefc2012-08-06 16:41:59 +08005060 double_rq_lock(env.dst_rq, busiest);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305061
5062 /*
5063 * cur_ld_moved - load moved in current iteration
5064 * ld_moved - cumulative load moved across iterations
5065 */
5066 cur_ld_moved = move_tasks(&env);
5067 ld_moved += cur_ld_moved;
Michael Wang78feefc2012-08-06 16:41:59 +08005068 double_rq_unlock(env.dst_rq, busiest);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005069 local_irq_restore(flags);
5070
Peter Zijlstra5d6523e2012-03-10 00:07:36 +01005071 if (env.flags & LBF_NEED_BREAK) {
5072 env.flags &= ~LBF_NEED_BREAK;
5073 goto more_balance;
5074 }
5075
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005076 /*
5077 * some other cpu did the load balance for us.
5078 */
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305079 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5080 resched_cpu(env.dst_cpu);
5081
5082 /*
5083 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5084 * us and move them to an alternate dst_cpu in our sched_group
5085 * where they can run. The upper limit on how many times we
5086		 * iterate on the same src_cpu depends on the number of cpus in our
5087 * sched_group.
5088 *
5089 * This changes load balance semantics a bit on who can move
5090 * load to a given_cpu. In addition to the given_cpu itself
5091		 * (or an ilb_cpu acting on its behalf where given_cpu is
5092 * nohz-idle), we now have balance_cpu in a position to move
5093 * load to given_cpu. In rare situations, this may cause
5094 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5095		 * _independently_ and at the _same_ time to move some load to
5096		 * given_cpu) causing excess load to be moved to given_cpu.
5097		 * This, however, should not happen often in practice, and
5098		 * subsequent load balance cycles should correct the
5099		 * excess load moved.
5100 */
5101 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 &&
5102 lb_iterations++ < max_lb_iterations) {
5103
Michael Wang78feefc2012-08-06 16:41:59 +08005104 env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri88b8dac2012-06-19 17:43:15 +05305105 env.dst_cpu = env.new_dst_cpu;
5106 env.flags &= ~LBF_SOME_PINNED;
5107 env.loop = 0;
5108 env.loop_break = sched_nr_migrate_break;
5109 /*
5110 * Go back to "more_balance" rather than "redo" since we
5111 * need to continue with same src_cpu.
5112 */
5113 goto more_balance;
5114 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005115
5116 /* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005117 if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005118 cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05305119 if (!cpumask_empty(cpus)) {
5120 env.loop = 0;
5121 env.loop_break = sched_nr_migrate_break;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005122 goto redo;
Prashanth Nageshappabbf18b12012-06-19 17:52:07 +05305123 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005124 goto out_balanced;
5125 }
5126 }
5127
5128 if (!ld_moved) {
5129 schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi58b26c42010-09-10 18:19:17 -07005130 /*
5131 * Increment the failure counter only on periodic balance.
5132 * We do not want newidle balance, which can be very
5133		 * frequent, to pollute the failure counter causing
5134 * excessive cache_hot migrations and active balances.
5135 */
5136 if (idle != CPU_NEWLY_IDLE)
5137 sd->nr_balance_failed++;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005138
Peter Zijlstrabd939f42012-05-02 14:20:37 +02005139 if (need_active_balance(&env)) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005140 raw_spin_lock_irqsave(&busiest->lock, flags);
5141
Tejun Heo969c7922010-05-06 18:49:21 +02005142 /* don't kick the active_load_balance_cpu_stop,
5143 * if the curr task on busiest cpu can't be
5144 * moved to this_cpu
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005145 */
5146 if (!cpumask_test_cpu(this_cpu,
Peter Zijlstrafa17b502011-06-16 12:23:22 +02005147 tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005148 raw_spin_unlock_irqrestore(&busiest->lock,
5149 flags);
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005150 env.flags |= LBF_ALL_PINNED;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005151 goto out_one_pinned;
5152 }
5153
Tejun Heo969c7922010-05-06 18:49:21 +02005154 /*
5155 * ->active_balance synchronizes accesses to
5156 * ->active_balance_work. Once set, it's cleared
5157 * only after active load balance is finished.
5158 */
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005159 if (!busiest->active_balance) {
5160 busiest->active_balance = 1;
5161 busiest->push_cpu = this_cpu;
5162 active_balance = 1;
5163 }
5164 raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo969c7922010-05-06 18:49:21 +02005165
Peter Zijlstrabd939f42012-05-02 14:20:37 +02005166 if (active_balance) {
Tejun Heo969c7922010-05-06 18:49:21 +02005167 stop_one_cpu_nowait(cpu_of(busiest),
5168 active_load_balance_cpu_stop, busiest,
5169 &busiest->active_balance_work);
Peter Zijlstrabd939f42012-05-02 14:20:37 +02005170 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005171
5172 /*
5173 * We've kicked active balancing, reset the failure
5174 * counter.
5175 */
5176 sd->nr_balance_failed = sd->cache_nice_tries+1;
5177 }
5178 } else
5179 sd->nr_balance_failed = 0;
5180
5181 if (likely(!active_balance)) {
5182 /* We were unbalanced, so reset the balancing interval */
5183 sd->balance_interval = sd->min_interval;
5184 } else {
5185 /*
5186 * If we've begun active balancing, start to back off. This
5187 * case may not be covered by the all_pinned logic if there
5188 * is only 1 task on the busy runqueue (because we don't call
5189 * move_tasks).
5190 */
5191 if (sd->balance_interval < sd->max_interval)
5192 sd->balance_interval *= 2;
5193 }
5194
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005195 goto out;
5196
5197out_balanced:
5198 schedstat_inc(sd, lb_balanced[idle]);
5199
5200 sd->nr_balance_failed = 0;
5201
5202out_one_pinned:
5203 /* tune up the balancing interval */
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005204 if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra5b54b562011-09-22 15:23:13 +02005205 sd->balance_interval < MAX_PINNED_INTERVAL) ||
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005206 (sd->balance_interval < sd->max_interval))
5207 sd->balance_interval *= 2;
5208
Venkatesh Pallipadi46e49b32011-02-14 14:38:50 -08005209 ld_moved = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005210out:
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005211 return ld_moved;
5212}
5213
5214/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005215 * idle_balance is called by schedule() if this_cpu is about to become
5216 * idle. Attempts to pull tasks from other CPUs.
5217 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02005218void idle_balance(int this_cpu, struct rq *this_rq)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005219{
5220 struct sched_domain *sd;
5221 int pulled_task = 0;
5222 unsigned long next_balance = jiffies + HZ;
5223
5224 this_rq->idle_stamp = this_rq->clock;
5225
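	/*
	 * If the average idle period of this CPU is shorter than the
	 * cost of migrating a task over, a newidle balance is unlikely
	 * to pay off; bail out before touching other runqueues.
	 */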
5226 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5227 return;
5228
Ben Segall18bf2802012-10-04 12:51:20 +02005229 update_rq_runnable_avg(this_rq, 1);
5230
Peter Zijlstraf492e122009-12-23 15:29:42 +01005231 /*
5232 * Drop the rq->lock, but keep IRQ/preempt disabled.
5233 */
5234 raw_spin_unlock(&this_rq->lock);
5235
Paul Turner48a16752012-10-04 13:18:31 +02005236 update_blocked_averages(this_cpu);
Peter Zijlstradce840a2011-04-07 14:09:50 +02005237 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005238 for_each_domain(this_cpu, sd) {
5239 unsigned long interval;
Peter Zijlstraf492e122009-12-23 15:29:42 +01005240 int balance = 1;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005241
5242 if (!(sd->flags & SD_LOAD_BALANCE))
5243 continue;
5244
Peter Zijlstraf492e122009-12-23 15:29:42 +01005245 if (sd->flags & SD_BALANCE_NEWIDLE) {
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005246 /* If we've pulled tasks over stop searching: */
Peter Zijlstraf492e122009-12-23 15:29:42 +01005247 pulled_task = load_balance(this_cpu, this_rq,
5248 sd, CPU_NEWLY_IDLE, &balance);
5249 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005250
5251 interval = msecs_to_jiffies(sd->balance_interval);
5252 if (time_after(next_balance, sd->last_balance + interval))
5253 next_balance = sd->last_balance + interval;
Nikhil Raod5ad1402010-11-17 11:42:04 -08005254 if (pulled_task) {
5255 this_rq->idle_stamp = 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005256 break;
Nikhil Raod5ad1402010-11-17 11:42:04 -08005257 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005258 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02005259 rcu_read_unlock();
Peter Zijlstraf492e122009-12-23 15:29:42 +01005260
5261 raw_spin_lock(&this_rq->lock);
5262
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005263 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5264 /*
5265 * We are going idle. next_balance may be set based on
5266 * a busy processor. So reset next_balance.
5267 */
5268 this_rq->next_balance = next_balance;
5269 }
5270}
5271
5272/*
Tejun Heo969c7922010-05-06 18:49:21 +02005273 * active_load_balance_cpu_stop is run by the cpu stopper. It pushes
5274 * running tasks off the busiest CPU onto idle CPUs. It requires at
5275 * least 1 task to be running on each physical CPU where possible, and
5276 * avoids physical / logical imbalances.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005277 */
Tejun Heo969c7922010-05-06 18:49:21 +02005278static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005279{
Tejun Heo969c7922010-05-06 18:49:21 +02005280 struct rq *busiest_rq = data;
5281 int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005282 int target_cpu = busiest_rq->push_cpu;
Tejun Heo969c7922010-05-06 18:49:21 +02005283 struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005284 struct sched_domain *sd;
Tejun Heo969c7922010-05-06 18:49:21 +02005285
5286 raw_spin_lock_irq(&busiest_rq->lock);
5287
5288 /* make sure the requested cpu hasn't gone down in the meantime */
5289 if (unlikely(busiest_cpu != smp_processor_id() ||
5290 !busiest_rq->active_balance))
5291 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005292
5293 /* Is there any task to move? */
5294 if (busiest_rq->nr_running <= 1)
Tejun Heo969c7922010-05-06 18:49:21 +02005295 goto out_unlock;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005296
5297 /*
5298 * This condition is "impossible"; if it occurs,
5299 * we need to fix it. Originally reported by
5300 * Bjorn Helgaas on a 128-cpu setup.
5301 */
5302 BUG_ON(busiest_rq == target_rq);
5303
5304 /* move a task from busiest_rq to target_rq */
5305 double_lock_balance(busiest_rq, target_rq);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005306
5307 /* Search for an sd spanning us and the target CPU. */
Peter Zijlstradce840a2011-04-07 14:09:50 +02005308 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005309 for_each_domain(target_cpu, sd) {
5310 if ((sd->flags & SD_LOAD_BALANCE) &&
5311 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5312 break;
5313 }
5314
5315 if (likely(sd)) {
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005316 struct lb_env env = {
5317 .sd = sd,
Peter Zijlstraddcdf6e2012-02-22 19:27:40 +01005318 .dst_cpu = target_cpu,
5319 .dst_rq = target_rq,
5320 .src_cpu = busiest_rq->cpu,
5321 .src_rq = busiest_rq,
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005322 .idle = CPU_IDLE,
5323 };
5324
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005325 schedstat_inc(sd, alb_count);
5326
Peter Zijlstra8e45cb52012-02-22 12:47:19 +01005327 if (move_one_task(&env))
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005328 schedstat_inc(sd, alb_pushed);
5329 else
5330 schedstat_inc(sd, alb_failed);
5331 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02005332 rcu_read_unlock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005333 double_unlock_balance(busiest_rq, target_rq);
Tejun Heo969c7922010-05-06 18:49:21 +02005334out_unlock:
5335 busiest_rq->active_balance = 0;
5336 raw_spin_unlock_irq(&busiest_rq->lock);
5337 return 0;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005338}
5339
5340#ifdef CONFIG_NO_HZ
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005341/*
5342 * idle load balancing details
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005343 * - When one of the busy CPUs notices that idle rebalancing may be
5344 * needed, it kicks the idle load balancer, which then does idle
5345 * load balancing for all the idle CPUs.
5346 */
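/*
 * nohz.idle_cpus_mask tracks the CPUs whose tick is currently stopped,
 * nohz.nr_cpus mirrors the weight of that mask so nohz_kick_needed()
 * can test for tickless CPUs with a single atomic read, and
 * nohz.next_balance is the next time (in jiffies) the idle set is due
 * to be rebalanced.
 */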
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005347static struct {
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005348 cpumask_var_t idle_cpus_mask;
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005349 atomic_t nr_cpus;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005350 unsigned long next_balance; /* in jiffy units */
5351} nohz ____cacheline_aligned;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005352
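/*
 * Pick the CPU that should run the next round of idle load balancing
 * on behalf of all tickless CPUs: currently simply the first CPU in
 * nohz.idle_cpus_mask that is still idle. Returns nr_cpu_ids if no
 * suitable CPU is found.
 */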
Peter Zijlstra8e7fbcb2012-01-09 11:28:35 +01005353static inline int find_new_ilb(int call_cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005354{
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005355 int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005356
Suresh Siddha786d6dc72011-12-01 17:07:35 -08005357 if (ilb < nr_cpu_ids && idle_cpu(ilb))
5358 return ilb;
5359
5360 return nr_cpu_ids;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005361}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005362
5363/*
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005364 * Kick a CPU to do the nohz balancing, if it is time for it. We pick
5365 * the first idle CPU in nohz.idle_cpus_mask, as found by
5366 * find_new_ilb() (if there is one).
5367 */
5368static void nohz_balancer_kick(int cpu)
5369{
5370 int ilb_cpu;
5371
5372 nohz.next_balance++;
5373
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005374 ilb_cpu = find_new_ilb(cpu);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005375
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005376 if (ilb_cpu >= nr_cpu_ids)
5377 return;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005378
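	/*
	 * If the kick bit was already set, another CPU has already
	 * signalled ilb_cpu and an IPI is on its way; nothing to do.
	 */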
Suresh Siddhacd490c52011-12-06 11:26:34 -08005379 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha1c792db2011-12-01 17:07:32 -08005380 return;
5381 /*
5382 * Use smp_send_reschedule() instead of resched_cpu().
5383 * This way we generate a sched IPI on the target cpu which
5384 * is idle. And the softirq performing nohz idle load balance
5385 * will be run before returning from the IPI.
5386 */
5387 smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005388 return;
5389}
5390
Alex Shic1cc0172012-09-10 15:10:58 +08005391static inline void nohz_balance_exit_idle(int cpu)
Suresh Siddha71325962012-01-19 18:28:57 -08005392{
5393 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5394 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5395 atomic_dec(&nohz.nr_cpus);
5396 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5397 }
5398}
5399
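/*
 * set_cpu_sd_state_busy() / set_cpu_sd_state_idle() keep
 * sgp->nr_busy_cpus up to date for every sched domain this CPU belongs
 * to: the count goes up on the first busy tick after idle and down when
 * the CPU goes (tickless) idle. nohz_kick_needed() uses the count for
 * its SD_SHARE_PKG_RESOURCES and SD_ASYM_PACKING checks.
 */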
Suresh Siddha69e1e812011-12-01 17:07:33 -08005400static inline void set_cpu_sd_state_busy(void)
5401{
5402 struct sched_domain *sd;
5403 int cpu = smp_processor_id();
5404
5405 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5406 return;
5407 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
5408
5409 rcu_read_lock();
5410 for_each_domain(cpu, sd)
5411 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5412 rcu_read_unlock();
5413}
5414
5415void set_cpu_sd_state_idle(void)
5416{
5417 struct sched_domain *sd;
5418 int cpu = smp_processor_id();
5419
5420 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
5421 return;
5422 set_bit(NOHZ_IDLE, nohz_flags(cpu));
5423
5424 rcu_read_lock();
5425 for_each_domain(cpu, sd)
5426 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5427 rcu_read_unlock();
5428}
5429
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005430/*
Alex Shic1cc0172012-09-10 15:10:58 +08005431 * This routine records that the cpu is going idle with its tick stopped.
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005432 * This info will be used in performing idle load balancing in the future.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005433 */
Alex Shic1cc0172012-09-10 15:10:58 +08005434void nohz_balance_enter_idle(int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005435{
Suresh Siddha71325962012-01-19 18:28:57 -08005436 /*
5437 * If this cpu is going down, then nothing needs to be done.
5438 */
5439 if (!cpu_active(cpu))
5440 return;
5441
Alex Shic1cc0172012-09-10 15:10:58 +08005442 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5443 return;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005444
Alex Shic1cc0172012-09-10 15:10:58 +08005445 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5446 atomic_inc(&nohz.nr_cpus);
5447 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005448}
Suresh Siddha71325962012-01-19 18:28:57 -08005449
5450static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
5451 unsigned long action, void *hcpu)
5452{
5453 switch (action & ~CPU_TASKS_FROZEN) {
5454 case CPU_DYING:
Alex Shic1cc0172012-09-10 15:10:58 +08005455 nohz_balance_exit_idle(smp_processor_id());
Suresh Siddha71325962012-01-19 18:28:57 -08005456 return NOTIFY_OK;
5457 default:
5458 return NOTIFY_DONE;
5459 }
5460}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005461#endif
5462
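/*
 * Serializes balancing of sched domains that have SD_SERIALIZE set
 * (see the need_serialize handling in rebalance_domains() below), so
 * that only one CPU at a time balances such a domain.
 */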
5463static DEFINE_SPINLOCK(balancing);
5464
Peter Zijlstra49c022e2011-04-05 10:14:25 +02005465/*
5466 * Scale the max load_balance interval with the number of CPUs in the system.
5467 * This trades load-balance latency on larger machines for less cross talk.
5468 */
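/*
 * For example, assuming HZ=1000: a 4-CPU machine gets
 * 1000 * 4 / 10 = 400 jiffies (~400ms) as the upper bound on any
 * domain's balance interval, while a 64-CPU machine gets
 * 6400 jiffies (~6.4s).
 */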
Peter Zijlstra029632f2011-10-25 10:00:11 +02005469void update_max_interval(void)
Peter Zijlstra49c022e2011-04-05 10:14:25 +02005470{
5471 max_load_balance_interval = HZ*num_online_cpus()/10;
5472}
5473
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005474/*
5475 * It checks each scheduling domain to see if it is due to be balanced,
5476 * and initiates a balancing operation if so.
5477 *
5478 * Balancing parameters are set up when the sched domains are built.
5479 */
5480static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5481{
5482 int balance = 1;
5483 struct rq *rq = cpu_rq(cpu);
5484 unsigned long interval;
Peter Zijlstra04f733b2012-05-11 00:12:02 +02005485 struct sched_domain *sd;
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005486 /* Earliest time when we have to do rebalance again */
5487 unsigned long next_balance = jiffies + 60*HZ;
5488 int update_next_balance = 0;
5489 int need_serialize;
5490
Paul Turner48a16752012-10-04 13:18:31 +02005491 update_blocked_averages(cpu);
Peter Zijlstra2069dd72010-11-15 15:47:00 -08005492
Peter Zijlstradce840a2011-04-07 14:09:50 +02005493 rcu_read_lock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005494 for_each_domain(cpu, sd) {
5495 if (!(sd->flags & SD_LOAD_BALANCE))
5496 continue;
5497
5498 interval = sd->balance_interval;
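		/*
		 * A busy CPU balances less often: stretch the interval by
		 * the domain's busy_factor when this CPU is not idle.
		 */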
5499 if (idle != CPU_IDLE)
5500 interval *= sd->busy_factor;
5501
5502 /* scale ms to jiffies */
5503 interval = msecs_to_jiffies(interval);
Peter Zijlstra49c022e2011-04-05 10:14:25 +02005504 interval = clamp(interval, 1UL, max_load_balance_interval);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005505
5506 need_serialize = sd->flags & SD_SERIALIZE;
5507
5508 if (need_serialize) {
5509 if (!spin_trylock(&balancing))
5510 goto out;
5511 }
5512
5513 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5514 if (load_balance(cpu, rq, sd, idle, &balance)) {
5515 /*
5516 * We've pulled tasks over so we're no
Peter Zijlstrac186faf2011-02-21 18:52:53 +01005517 * longer idle.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005518 */
5519 idle = CPU_NOT_IDLE;
5520 }
5521 sd->last_balance = jiffies;
5522 }
5523 if (need_serialize)
5524 spin_unlock(&balancing);
5525out:
5526 if (time_after(next_balance, sd->last_balance + interval)) {
5527 next_balance = sd->last_balance + interval;
5528 update_next_balance = 1;
5529 }
5530
5531 /*
5532 * Stop the load balance at this level. There is another
5533 * CPU in our sched group which is doing load balancing more
5534 * actively.
5535 */
5536 if (!balance)
5537 break;
5538 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02005539 rcu_read_unlock();
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005540
5541 /*
5542 * next_balance will be updated only when there is a need.
5543 * When the cpu is attached to null domain for ex, it will not be
5544 * updated.
5545 */
5546 if (likely(update_next_balance))
5547 rq->next_balance = next_balance;
5548}
5549
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005550#ifdef CONFIG_NO_HZ
5551/*
5552 * In the CONFIG_NO_HZ case, the CPU kicked for idle balancing does the
5553 * rebalancing for all the cpus for which scheduler ticks are stopped.
5554 */
5555static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5556{
5557 struct rq *this_rq = cpu_rq(this_cpu);
5558 struct rq *rq;
5559 int balance_cpu;
5560
Suresh Siddha1c792db2011-12-01 17:07:32 -08005561 if (idle != CPU_IDLE ||
5562 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5563 goto end;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005564
5565 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
Suresh Siddha8a6d42d2011-12-06 11:19:37 -08005566 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005567 continue;
5568
5569 /*
5570 * If this cpu gets work to do, stop the load balancing
5571 * work being done for other cpus. Next load
5572 * balancing owner will pick it up.
5573 */
Suresh Siddha1c792db2011-12-01 17:07:32 -08005574 if (need_resched())
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005575 break;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005576
Vincent Guittot5ed4f1d2012-09-13 06:11:26 +02005577 rq = cpu_rq(balance_cpu);
5578
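		/*
		 * The tickless CPU cannot update its own rq clock or cpu
		 * load while its tick is stopped, so do that bookkeeping
		 * here on its behalf before rebalancing its domains.
		 */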
5579 raw_spin_lock_irq(&rq->lock);
5580 update_rq_clock(rq);
5581 update_idle_cpu_load(rq);
5582 raw_spin_unlock_irq(&rq->lock);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005583
5584 rebalance_domains(balance_cpu, CPU_IDLE);
5585
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005586 if (time_after(this_rq->next_balance, rq->next_balance))
5587 this_rq->next_balance = rq->next_balance;
5588 }
5589 nohz.next_balance = this_rq->next_balance;
Suresh Siddha1c792db2011-12-01 17:07:32 -08005590end:
5591 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005592}
5593
5594/*
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005595 * Current heuristic for kicking the idle load balancer in the presence
5596 * of an idle cpu in the system:
5597 * - This rq has more than one task.
5598 * - At any scheduler domain level, this cpu's scheduler group has multiple
5599 * busy cpus exceeding the group's power.
5600 * - For SD_ASYM_PACKING, if the lower numbered cpus in the scheduler
5601 * domain span are idle.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005602 */
5603static inline int nohz_kick_needed(struct rq *rq, int cpu)
5604{
5605 unsigned long now = jiffies;
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005606 struct sched_domain *sd;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005607
Suresh Siddha1c792db2011-12-01 17:07:32 -08005608 if (unlikely(idle_cpu(cpu)))
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005609 return 0;
5610
Suresh Siddha1c792db2011-12-01 17:07:32 -08005611 /*
5612 * We may recently have been in ticked or tickless idle mode. At the first
5613 * busy tick after returning from idle, we will update the busy stats.
5614 */
Suresh Siddha69e1e812011-12-01 17:07:33 -08005615 set_cpu_sd_state_busy();
Alex Shic1cc0172012-09-10 15:10:58 +08005616 nohz_balance_exit_idle(cpu);
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005617
5618 /*
5619 * None are in tickless mode and hence no need for NOHZ idle load
5620 * balancing.
5621 */
5622 if (likely(!atomic_read(&nohz.nr_cpus)))
5623 return 0;
Suresh Siddha1c792db2011-12-01 17:07:32 -08005624
5625 if (time_before(now, nohz.next_balance))
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005626 return 0;
5627
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005628 if (rq->nr_running >= 2)
5629 goto need_kick;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005630
Peter Zijlstra067491b2011-12-07 14:32:08 +01005631 rcu_read_lock();
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005632 for_each_domain(cpu, sd) {
5633 struct sched_group *sg = sd->groups;
5634 struct sched_group_power *sgp = sg->sgp;
5635 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005636
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005637 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
Peter Zijlstra067491b2011-12-07 14:32:08 +01005638 goto need_kick_unlock;
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005639
5640 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5641 && (cpumask_first_and(nohz.idle_cpus_mask,
5642 sched_domain_span(sd)) < cpu))
Peter Zijlstra067491b2011-12-07 14:32:08 +01005643 goto need_kick_unlock;
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005644
5645 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5646 break;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005647 }
Peter Zijlstra067491b2011-12-07 14:32:08 +01005648 rcu_read_unlock();
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005649 return 0;
Peter Zijlstra067491b2011-12-07 14:32:08 +01005650
5651need_kick_unlock:
5652 rcu_read_unlock();
Suresh Siddha0b005cf2011-12-01 17:07:34 -08005653need_kick:
5654 return 1;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005655}
5656#else
5657static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5658#endif
5659
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005660/*
5661 * run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005662 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005663 */
5664static void run_rebalance_domains(struct softirq_action *h)
5665{
5666 int this_cpu = smp_processor_id();
5667 struct rq *this_rq = cpu_rq(this_cpu);
Suresh Siddha6eb57e02011-10-03 15:09:01 -07005668 enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005669 CPU_IDLE : CPU_NOT_IDLE;
5670
5671 rebalance_domains(this_cpu, idle);
5672
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005673 /*
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005674 * If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005675 * balancing on behalf of the other idle cpus whose ticks are
5676 * stopped.
5677 */
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005678 nohz_idle_balance(this_cpu, idle);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005679}
5680
5681static inline int on_null_domain(int cpu)
5682{
Paul E. McKenney90a65012010-02-28 08:32:18 -08005683 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005684}
5685
5686/*
5687 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005688 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02005689void trigger_load_balance(struct rq *rq, int cpu)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005690{
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005691 /* Don't need to rebalance while attached to NULL domain */
5692 if (time_after_eq(jiffies, rq->next_balance) &&
5693 likely(!on_null_domain(cpu)))
5694 raise_softirq(SCHED_SOFTIRQ);
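	/*
	 * Separately from the softirq above, check whether a tickless idle
	 * CPU should be kicked to run the nohz idle balance on behalf of
	 * all idle CPUs.
	 */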
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005695#ifdef CONFIG_NO_HZ
Suresh Siddha1c792db2011-12-01 17:07:32 -08005696 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07005697 nohz_balancer_kick(cpu);
5698#endif
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01005699}
5700
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01005701static void rq_online_fair(struct rq *rq)
5702{
5703 update_sysctl();
5704}
5705
5706static void rq_offline_fair(struct rq *rq)
5707{
5708 update_sysctl();
Peter Boonstoppela4c96ae2012-08-09 15:34:47 -07005709
5710 /* Ensure any throttled groups are reachable by pick_next_task */
5711 unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01005712}
5713
Dhaval Giani55e12e52008-06-24 23:39:43 +05305714#endif /* CONFIG_SMP */
Peter Williamse1d14842007-10-24 18:23:51 +02005715
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005716/*
5717 * scheduler tick hitting a task of our scheduling class:
5718 */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005719static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005720{
5721 struct cfs_rq *cfs_rq;
5722 struct sched_entity *se = &curr->se;
5723
5724 for_each_sched_entity(se) {
5725 cfs_rq = cfs_rq_of(se);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01005726 entity_tick(cfs_rq, se, queued);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005727 }
Ben Segall18bf2802012-10-04 12:51:20 +02005728
Peter Zijlstracbee9f82012-10-25 14:16:43 +02005729 if (sched_feat_numa(NUMA))
5730 task_tick_numa(rq, curr);
Linus Torvalds3d59eeb2012-12-16 14:33:25 -08005731
Ben Segall18bf2802012-10-04 12:51:20 +02005732 update_rq_runnable_avg(rq, 1);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005733}
5734
5735/*
Peter Zijlstracd29fe62009-11-27 17:32:46 +01005736 * called on fork with the child task as argument from the parent's context
5737 * - child not yet on the tasklist
5738 * - preemption disabled
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005739 */
Peter Zijlstracd29fe62009-11-27 17:32:46 +01005740static void task_fork_fair(struct task_struct *p)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005741{
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +09005742 struct cfs_rq *cfs_rq;
5743 struct sched_entity *se = &p->se, *curr;
Ingo Molnar00bf7bf2007-10-15 17:00:14 +02005744 int this_cpu = smp_processor_id();
Peter Zijlstracd29fe62009-11-27 17:32:46 +01005745 struct rq *rq = this_rq();
5746 unsigned long flags;
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005747
Thomas Gleixner05fa7852009-11-17 14:28:38 +01005748 raw_spin_lock_irqsave(&rq->lock, flags);
Peter Zijlstracd29fe62009-11-27 17:32:46 +01005749
Peter Zijlstra861d0342010-08-19 13:31:43 +02005750 update_rq_clock(rq);
5751
Daisuke Nishimura4fc420c2011-12-15 14:36:55 +09005752 cfs_rq = task_cfs_rq(current);
5753 curr = cfs_rq->curr;
5754
Paul E. McKenneyb0a0f662010-10-06 17:32:51 -07005755 if (unlikely(task_cpu(p) != this_cpu)) {
5756 rcu_read_lock();
Peter Zijlstracd29fe62009-11-27 17:32:46 +01005757 __set_task_cpu(p, this_cpu);
Paul E. McKenneyb0a0f662010-10-06 17:32:51 -07005758 rcu_read_unlock();
5759 }
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005760
Ting Yang7109c442007-08-28 12:53:24 +02005761 update_curr(cfs_rq);
Peter Zijlstracd29fe62009-11-27 17:32:46 +01005762
Mike Galbraithb5d9d732009-09-08 11:12:28 +02005763 if (curr)
5764 se->vruntime = curr->vruntime;
Peter Zijlstraaeb73b02007-10-15 17:00:05 +02005765 place_entity(cfs_rq, se, 1);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +02005766
Peter Zijlstracd29fe62009-11-27 17:32:46 +01005767 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko87fefa32007-10-15 17:00:08 +02005768 /*
Ingo Molnaredcb60a2007-10-15 17:00:08 +02005769 * Upon rescheduling, sched_class::put_prev_task() will place
5770 * 'current' within the tree based on its new key value.
5771 */
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +02005772 swap(curr->vruntime, se->vruntime);
Bharata B Raoaec0a512008-08-28 14:42:49 +05305773 resched_task(rq->curr);
Peter Zijlstra4d78e7b2007-10-15 17:00:04 +02005774 }
5775
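	/*
	 * The child may be placed on a different cfs_rq by the time
	 * wake_up_new_task() runs, so keep its vruntime relative to this
	 * queue's min_vruntime; enqueue_entity() will rebase it onto the
	 * destination runqueue.
	 */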
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005776 se->vruntime -= cfs_rq->min_vruntime;
5777
Thomas Gleixner05fa7852009-11-17 14:28:38 +01005778 raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02005779}
5780
Steven Rostedtcb469842008-01-25 21:08:22 +01005781/*
5782 * Priority of the task has changed. Check to see if we preempt
5783 * the current task.
5784 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005785static void
5786prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
Steven Rostedtcb469842008-01-25 21:08:22 +01005787{
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005788 if (!p->se.on_rq)
5789 return;
5790
Steven Rostedtcb469842008-01-25 21:08:22 +01005791 /*
5792 * Reschedule if we are currently running on this runqueue and
5793 * our priority decreased, or if we are not currently running on
5794 * this runqueue and our priority is higher than the current's
5795 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005796 if (rq->curr == p) {
Steven Rostedtcb469842008-01-25 21:08:22 +01005797 if (p->prio > oldprio)
5798 resched_task(rq->curr);
5799 } else
Peter Zijlstra15afe092008-09-20 23:38:02 +02005800 check_preempt_curr(rq, p, 0);
Steven Rostedtcb469842008-01-25 21:08:22 +01005801}
5802
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005803static void switched_from_fair(struct rq *rq, struct task_struct *p)
5804{
5805 struct sched_entity *se = &p->se;
5806 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5807
5808 /*
5809 * Ensure the task's vruntime is normalized, so that when its
5810 * switched back to the fair class the enqueue_entity(.flags=0) will
5811 * do the right thing.
5812 *
5813 * If it was on_rq, then the dequeue_entity(.flags=0) will already
5814 * have normalized the vruntime, if it was !on_rq, then only when
5815 * the task is sleeping will it still have non-normalized vruntime.
5816 */
5817 if (!se->on_rq && p->state != TASK_RUNNING) {
5818 /*
5819 * Fix up our vruntime so that the current sleep doesn't
5820 * cause 'unlimited' sleep bonus.
5821 */
5822 place_entity(cfs_rq, se, 0);
5823 se->vruntime -= cfs_rq->min_vruntime;
5824 }
Paul Turner9ee474f2012-10-04 13:18:30 +02005825
5826#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5827 /*
5828 * Remove our load from contribution when we leave sched_fair
5829 * and ensure we don't carry in an old decay_count if we
5830 * switch back.
5831 */
5832 if (p->se.avg.decay_count) {
5833 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5834 __synchronize_entity_decay(&p->se);
5835 subtract_blocked_load_contrib(cfs_rq,
5836 p->se.avg.load_avg_contrib);
5837 }
5838#endif
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005839}
5840
Steven Rostedtcb469842008-01-25 21:08:22 +01005841/*
5842 * We switched to the sched_fair class.
5843 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005844static void switched_to_fair(struct rq *rq, struct task_struct *p)
Steven Rostedtcb469842008-01-25 21:08:22 +01005845{
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005846 if (!p->se.on_rq)
5847 return;
5848
Steven Rostedtcb469842008-01-25 21:08:22 +01005849 /*
5850 * We were most likely switched from sched_rt, so
5851 * kick off the schedule if running, otherwise just see
5852 * if we can still preempt the current task.
5853 */
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005854 if (rq->curr == p)
Steven Rostedtcb469842008-01-25 21:08:22 +01005855 resched_task(rq->curr);
5856 else
Peter Zijlstra15afe092008-09-20 23:38:02 +02005857 check_preempt_curr(rq, p, 0);
Steven Rostedtcb469842008-01-25 21:08:22 +01005858}
5859
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02005860/*
 * Account for a task changing its policy or group.
5861 *
5862 * This routine is mostly called to set cfs_rq->curr field when a task
5863 * migrates between groups/classes.
5864 */
5865static void set_curr_task_fair(struct rq *rq)
5866{
5867 struct sched_entity *se = &rq->curr->se;
5868
Paul Turnerec12cb72011-07-21 09:43:30 -07005869 for_each_sched_entity(se) {
5870 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5871
5872 set_next_entity(cfs_rq, se);
5873 /* ensure bandwidth has been allocated on our new cfs_rq */
5874 account_cfs_rq_runtime(cfs_rq, 0);
5875 }
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02005876}
5877
Peter Zijlstra029632f2011-10-25 10:00:11 +02005878void init_cfs_rq(struct cfs_rq *cfs_rq)
5879{
5880 cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra029632f2011-10-25 10:00:11 +02005881 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5882#ifndef CONFIG_64BIT
5883 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5884#endif
Paul Turner9ee474f2012-10-04 13:18:30 +02005885#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5886 atomic64_set(&cfs_rq->decay_counter, 1);
Paul Turneraff3e492012-10-04 13:18:30 +02005887 atomic64_set(&cfs_rq->removed_load, 0);
Paul Turner9ee474f2012-10-04 13:18:30 +02005888#endif
Peter Zijlstra029632f2011-10-25 10:00:11 +02005889}
5890
Peter Zijlstra810b3812008-02-29 15:21:01 -05005891#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +02005892static void task_move_group_fair(struct task_struct *p, int on_rq)
Peter Zijlstra810b3812008-02-29 15:21:01 -05005893{
Paul Turneraff3e492012-10-04 13:18:30 +02005894 struct cfs_rq *cfs_rq;
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +02005895 /*
5896 * If the task was not on the rq at the time of this cgroup movement
5897 * it must have been asleep, sleeping tasks keep their ->vruntime
5898 * absolute on their old rq until wakeup (needed for the fair sleeper
5899 * bonus in place_entity()).
5900 *
5901 * If it was on the rq, we've just 'preempted' it, which does convert
5902 * ->vruntime to a relative base.
5903 *
5904 * Make sure both cases convert their relative position when migrating
5905 * to another cgroup's rq. This does somewhat interfere with the
5906 * fair sleeper stuff for the first placement, but who cares.
5907 */
Daisuke Nishimura7ceff012011-12-15 14:36:07 +09005908 /*
5909 * When !on_rq, vruntime of the task has usually NOT been normalized.
5910 * But there are some cases where it has already been normalized:
5911 *
5912 * - Moving a forked child which is waiting for being woken up by
5913 * wake_up_new_task().
Daisuke Nishimura62af3782011-12-15 14:37:41 +09005914 * - Moving a task which has been woken up by try_to_wake_up() and
5915 * waiting for actually being woken up by sched_ttwu_pending().
Daisuke Nishimura7ceff012011-12-15 14:36:07 +09005916 *
5917 * To prevent boost or penalty in the new cfs_rq caused by delta
5918 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5919 */
Daisuke Nishimura62af3782011-12-15 14:37:41 +09005920 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
Daisuke Nishimura7ceff012011-12-15 14:36:07 +09005921 on_rq = 1;
5922
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01005923 if (!on_rq)
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +02005924 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5925 set_task_rq(p, task_cpu(p));
Paul Turneraff3e492012-10-04 13:18:30 +02005926 if (!on_rq) {
5927 cfs_rq = cfs_rq_of(&p->se);
5928 p->se.vruntime += cfs_rq->min_vruntime;
5929#ifdef CONFIG_SMP
5930 /*
5931 * migrate_task_rq_fair() will have removed our previous
5932 * contribution, but we must synchronize for ongoing future
5933 * decay.
5934 */
5935 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5936 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5937#endif
5938 }
Peter Zijlstra810b3812008-02-29 15:21:01 -05005939}
Peter Zijlstra029632f2011-10-25 10:00:11 +02005940
5941void free_fair_sched_group(struct task_group *tg)
5942{
5943 int i;
5944
5945 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5946
5947 for_each_possible_cpu(i) {
5948 if (tg->cfs_rq)
5949 kfree(tg->cfs_rq[i]);
5950 if (tg->se)
5951 kfree(tg->se[i]);
5952 }
5953
5954 kfree(tg->cfs_rq);
5955 kfree(tg->se);
5956}
5957
5958int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5959{
5960 struct cfs_rq *cfs_rq;
5961 struct sched_entity *se;
5962 int i;
5963
5964 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5965 if (!tg->cfs_rq)
5966 goto err;
5967 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5968 if (!tg->se)
5969 goto err;
5970
5971 tg->shares = NICE_0_LOAD;
5972
5973 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5974
5975 for_each_possible_cpu(i) {
5976 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5977 GFP_KERNEL, cpu_to_node(i));
5978 if (!cfs_rq)
5979 goto err;
5980
5981 se = kzalloc_node(sizeof(struct sched_entity),
5982 GFP_KERNEL, cpu_to_node(i));
5983 if (!se)
5984 goto err_free_rq;
5985
5986 init_cfs_rq(cfs_rq);
5987 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5988 }
5989
5990 return 1;
5991
5992err_free_rq:
5993 kfree(cfs_rq);
5994err:
5995 return 0;
5996}
5997
5998void unregister_fair_sched_group(struct task_group *tg, int cpu)
5999{
6000 struct rq *rq = cpu_rq(cpu);
6001 unsigned long flags;
6002
6003 /*
6004 * Only empty task groups can be destroyed; so we can speculatively
6005 * check on_list without danger of it being re-added.
6006 */
6007 if (!tg->cfs_rq[cpu]->on_list)
6008 return;
6009
6010 raw_spin_lock_irqsave(&rq->lock, flags);
6011 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6012 raw_spin_unlock_irqrestore(&rq->lock, flags);
6013}
6014
6015void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6016 struct sched_entity *se, int cpu,
6017 struct sched_entity *parent)
6018{
6019 struct rq *rq = cpu_rq(cpu);
6020
6021 cfs_rq->tg = tg;
6022 cfs_rq->rq = rq;
Peter Zijlstra029632f2011-10-25 10:00:11 +02006023 init_cfs_rq_runtime(cfs_rq);
6024
6025 tg->cfs_rq[cpu] = cfs_rq;
6026 tg->se[cpu] = se;
6027
6028 /* se could be NULL for root_task_group */
6029 if (!se)
6030 return;
6031
6032 if (!parent)
6033 se->cfs_rq = &rq->cfs;
6034 else
6035 se->cfs_rq = parent->my_q;
6036
6037 se->my_q = cfs_rq;
6038 update_load_set(&se->load, 0);
6039 se->parent = parent;
6040}
6041
6042static DEFINE_MUTEX(shares_mutex);
6043
6044int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6045{
6046 int i;
6047 unsigned long flags;
6048
6049 /*
6050 * We can't change the weight of the root cgroup.
6051 */
6052 if (!tg->se[0])
6053 return -EINVAL;
6054
6055 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6056
6057 mutex_lock(&shares_mutex);
6058 if (tg->shares == shares)
6059 goto done;
6060
6061 tg->shares = shares;
6062 for_each_possible_cpu(i) {
6063 struct rq *rq = cpu_rq(i);
6064 struct sched_entity *se;
6065
6066 se = tg->se[i];
6067 /* Propagate contribution to hierarchy */
6068 raw_spin_lock_irqsave(&rq->lock, flags);
Linus Torvalds17bc14b2012-12-14 07:20:43 -08006069 for_each_sched_entity(se)
Peter Zijlstra029632f2011-10-25 10:00:11 +02006070 update_cfs_shares(group_cfs_rq(se));
6071 raw_spin_unlock_irqrestore(&rq->lock, flags);
6072 }
6073
6074done:
6075 mutex_unlock(&shares_mutex);
6076 return 0;
6077}
6078#else /* CONFIG_FAIR_GROUP_SCHED */
6079
6080void free_fair_sched_group(struct task_group *tg) { }
6081
6082int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6083{
6084 return 1;
6085}
6086
6087void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6088
6089#endif /* CONFIG_FAIR_GROUP_SCHED */
6090
Peter Zijlstra810b3812008-02-29 15:21:01 -05006091
H Hartley Sweeten6d686f42010-01-13 20:21:52 -07006092static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
Peter Williams0d721ce2009-09-21 01:31:53 +00006093{
6094 struct sched_entity *se = &task->se;
Peter Williams0d721ce2009-09-21 01:31:53 +00006095 unsigned int rr_interval = 0;
6096
6097 /*
6098 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
6099 * idle runqueue:
6100 */
Peter Williams0d721ce2009-09-21 01:31:53 +00006101 if (rq->cfs.load.weight)
Zhu Yanhaia59f4e02013-01-08 12:56:52 +08006102 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams0d721ce2009-09-21 01:31:53 +00006103
6104 return rr_interval;
6105}
6106
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006107/*
6108 * All the scheduling class methods:
6109 */
Peter Zijlstra029632f2011-10-25 10:00:11 +02006110const struct sched_class fair_sched_class = {
Ingo Molnar5522d5d2007-10-15 17:00:12 +02006111 .next = &idle_sched_class,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006112 .enqueue_task = enqueue_task_fair,
6113 .dequeue_task = dequeue_task_fair,
6114 .yield_task = yield_task_fair,
Mike Galbraithd95f4122011-02-01 09:50:51 -05006115 .yield_to_task = yield_to_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006116
Ingo Molnar2e09bf52007-10-15 17:00:05 +02006117 .check_preempt_curr = check_preempt_wakeup,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006118
6119 .pick_next_task = pick_next_task_fair,
6120 .put_prev_task = put_prev_task_fair,
6121
Peter Williams681f3e62007-10-24 18:23:51 +02006122#ifdef CONFIG_SMP
Li Zefan4ce72a22008-10-22 15:25:26 +08006123 .select_task_rq = select_task_rq_fair,
Paul Turnerf4e26b12012-10-04 13:18:32 +02006124#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Turner0a74bef2012-10-04 13:18:30 +02006125 .migrate_task_rq = migrate_task_rq_fair,
Paul Turnerf4e26b12012-10-04 13:18:32 +02006126#endif
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01006127 .rq_online = rq_online_fair,
6128 .rq_offline = rq_offline_fair,
Peter Zijlstra88ec22d2009-12-16 18:04:41 +01006129
6130 .task_waking = task_waking_fair,
Peter Williams681f3e62007-10-24 18:23:51 +02006131#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006132
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02006133 .set_curr_task = set_curr_task_fair,
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006134 .task_tick = task_tick_fair,
Peter Zijlstracd29fe62009-11-27 17:32:46 +01006135 .task_fork = task_fork_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +01006136
6137 .prio_changed = prio_changed_fair,
Peter Zijlstrada7a7352011-01-17 17:03:27 +01006138 .switched_from = switched_from_fair,
Steven Rostedtcb469842008-01-25 21:08:22 +01006139 .switched_to = switched_to_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -05006140
Peter Williams0d721ce2009-09-21 01:31:53 +00006141 .get_rr_interval = get_rr_interval_fair,
6142
Peter Zijlstra810b3812008-02-29 15:21:01 -05006143#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +02006144 .task_move_group = task_move_group_fair,
Peter Zijlstra810b3812008-02-29 15:21:01 -05006145#endif
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006146};
6147
6148#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra029632f2011-10-25 10:00:11 +02006149void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006150{
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006151 struct cfs_rq *cfs_rq;
6152
Peter Zijlstra5973e5b2008-01-25 21:08:34 +01006153 rcu_read_lock();
Ingo Molnarc3b64f12007-08-09 11:16:51 +02006154 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar5cef9ec2007-08-09 11:16:47 +02006155 print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra5973e5b2008-01-25 21:08:34 +01006156 rcu_read_unlock();
Ingo Molnarbf0f6f22007-07-09 18:51:58 +02006157}
6158#endif
Peter Zijlstra029632f2011-10-25 10:00:11 +02006159
6160__init void init_sched_fair_class(void)
6161{
6162#ifdef CONFIG_SMP
6163 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6164
6165#ifdef CONFIG_NO_HZ
Diwakar Tundlam554ceca2012-03-07 14:44:26 -08006166 nohz.next_balance = jiffies;
Peter Zijlstra029632f2011-10-25 10:00:11 +02006167 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Suresh Siddha71325962012-01-19 18:28:57 -08006168 cpu_notifier(sched_ilb_notifier, 0);
Peter Zijlstra029632f2011-10-25 10:00:11 +02006169#endif
6170#endif /* SMP */
6171
6172}