Blame - kernel/sched/fair.c - SHIFTPHONES/android_kernel_shift_sdm845

blob: 9efd34f63e8196c9ab56baa9fc3cdbf0e5185047 [file] [log] [blame]

Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1	/*
				2	* Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
				3	*
				4	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
				5	*
				6	* Interactivity improvements by Mike Galbraith
				7	* (C) 2007 Mike Galbraith <efault@gmx.de>
				8	*
				9	* Various enhancements by Dmitry Adamushko.
				10	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
				11	*
				12	* Group scheduling enhancements by Srivatsa Vaddagiri
				13	* Copyright IBM Corporation, 2007
				14	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
				15	*
				16	* Scaled math optimizations by Thomas Gleixner
				17	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	18	*
				19	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
				20	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	21	*/
				22
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	23	#include <linux/latencytop.h>
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	24	#include <linux/sched.h>
Sisir Koppaka	3436ae1	2011-03-26 18:22:55 +0530	[diff] [blame]	25	#include <linux/cpumask.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	26	#include <linux/slab.h>
				27	#include <linux/profile.h>
				28	#include <linux/interrupt.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	29	#include <linux/mempolicy.h>
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	30	#include <linux/migrate.h>
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	31	#include <linux/task_work.h>
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	32
				33	#include <trace/events/sched.h>
				34
				35	#include "sched.h"
Arjan van de Ven	9745512	2008-01-25 21:08:34 +0100	[diff] [blame]	36
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	37	/*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	38	* Targeted preemption latency for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	39	* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	40	*
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	41	* NOTE: this latency value is not the same as the concept of
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	42	* 'timeslice length' - timeslices in CFS are of variable length
				43	* and have no persistent notion like in traditional, time-slice
				44	* based scheduling concepts.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	45	*
Ingo Molnar	d274a4c	2007-10-15 17:00:14 +0200	[diff] [blame]	46	* (to see the precise effective timeslice length of your workload,
				47	* run vmstat and monitor the context-switches (cs) field)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	48	*/
Mike Galbraith	2140692	2010-03-11 17:17:15 +0100	[diff] [blame]	49	unsigned int sysctl_sched_latency = 6000000ULL;
				50	unsigned int normalized_sysctl_sched_latency = 6000000ULL;
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	51
				52	/*
Christian Ehrhardt	1983a92	2009-11-30 12:16:47 +0100	[diff] [blame]	53	* The initial- and re-scaling of tunables is configurable
				54	* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
				55	*
				56	* Options are:
				57	* SCHED_TUNABLESCALING_NONE - unscaled, always *1
				58	* SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
				59	* SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
				60	*/
				61	enum sched_tunable_scaling sysctl_sched_tunable_scaling
				62	= SCHED_TUNABLESCALING_LOG;
				63
				64	/*
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	65	* Minimal preemption granularity for CPU-bound tasks:
Takuya Yoshikawa	864616e	2010-10-14 16:09:13 +0900	[diff] [blame]	66	* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	67	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	68	unsigned int sysctl_sched_min_granularity = 750000ULL;
				69	unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	70
				71	/*
				72	* is kept at sysctl_sched_latency / sysctl_sched_min_granularity
				73	*/
Ingo Molnar	0bf377b	2010-09-12 08:14:52 +0200	[diff] [blame]	74	static unsigned int sched_nr_latency = 8;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	75
				76	/*
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	77	* After fork, child runs first. If set to 0 (default) then
Ingo Molnar	2bd8e6d	2007-10-15 17:00:02 +0200	[diff] [blame]	78	* parent will (try to) run first.
				79	*/
Mike Galbraith	2bba22c	2009-09-09 15:41:37 +0200	[diff] [blame]	80	unsigned int sysctl_sched_child_runs_first __read_mostly;
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	81
				82	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	83	* SCHED_OTHER wake-up granularity.
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	84	* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	85	*
				86	* This option delays the preemption effects of decoupled workloads
				87	* and reduces their over-scheduling. Synchronous workloads will still
				88	* have immediate wakeup/sleep latencies.
				89	*/
Mike Galbraith	172e082	2009-09-09 15:41:37 +0200	[diff] [blame]	90	unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	91	unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	92
Ingo Molnar	da84d96	2007-10-15 17:00:18 +0200	[diff] [blame]	93	const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
				94
Paul Turner	a7a4f8a	2010-11-15 15:47:06 -0800	[diff] [blame]	95	/*
				96	* The exponential sliding window over which load is averaged for shares
				97	* distribution.
				98	* (default: 10msec)
				99	*/
				100	unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
				101
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	102	#ifdef CONFIG_CFS_BANDWIDTH
				103	/*
				104	* Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
				105	* each time a cfs_rq requests quota.
				106	*
				107	* Note: in the case that the slice exceeds the runtime remaining (either due
				108	* to consumption or the quota being specified to be smaller than the slice)
				109	* we will always only issue the remaining available time.
				110	*
				111	* default: 5 msec, units: microseconds
				112	*/
				113	unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
				114	#endif
				115
Paul Gortmaker	8527632	2013-04-19 15:10:50 -0400	[diff] [blame]	116	static inline void update_load_add(struct load_weight *lw, unsigned long inc)
				117	{
				118	lw->weight += inc;
				119	lw->inv_weight = 0;
				120	}
				121
				122	static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
				123	{
				124	lw->weight -= dec;
				125	lw->inv_weight = 0;
				126	}
				127
				128	static inline void update_load_set(struct load_weight *lw, unsigned long w)
				129	{
				130	lw->weight = w;
				131	lw->inv_weight = 0;
				132	}
				133
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	134	/*
				135	* Increase the granularity value when there are more CPUs,
				136	* because with more CPUs the 'effective latency' as visible
				137	* to users decreases. But the relationship is not linear,
				138	* so pick a second-best guess by going with the log2 of the
				139	* number of CPUs.
				140	*
				141	* This idea comes from the SD scheduler of Con Kolivas:
				142	*/
				143	static int get_update_sysctl_factor(void)
				144	{
				145	unsigned int cpus = min_t(int, num_online_cpus(), 8);
				146	unsigned int factor;
				147
				148	switch (sysctl_sched_tunable_scaling) {
				149	case SCHED_TUNABLESCALING_NONE:
				150	factor = 1;
				151	break;
				152	case SCHED_TUNABLESCALING_LINEAR:
				153	factor = cpus;
				154	break;
				155	case SCHED_TUNABLESCALING_LOG:
				156	default:
				157	factor = 1 + ilog2(cpus);
				158	break;
				159	}
				160
				161	return factor;
				162	}
				163
				164	static void update_sysctl(void)
				165	{
				166	unsigned int factor = get_update_sysctl_factor();
				167
				168	#define SET_SYSCTL(name) \
				169	(sysctl_##name = (factor) * normalized_sysctl_##name)
				170	SET_SYSCTL(sched_min_granularity);
				171	SET_SYSCTL(sched_latency);
				172	SET_SYSCTL(sched_wakeup_granularity);
				173	#undef SET_SYSCTL
				174	}
				175
				176	void sched_init_granularity(void)
				177	{
				178	update_sysctl();
				179	}
				180
				181	#if BITS_PER_LONG == 32
				182	# define WMULT_CONST (~0UL)
				183	#else
				184	# define WMULT_CONST (1UL << 32)
				185	#endif
				186
				187	#define WMULT_SHIFT 32
				188
				189	/*
				190	* Shift right and round:
				191	*/
				192	#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
				193
				194	/*
				195	* delta *= weight / lw
				196	*/
				197	static unsigned long
				198	calc_delta_mine(unsigned long delta_exec, unsigned long weight,
				199	struct load_weight *lw)
				200	{
				201	u64 tmp;
				202
				203	/*
				204	* weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
				205	* entities since MIN_SHARES = 2. Treat weight as 1 if less than
				206	* 2^SCHED_LOAD_RESOLUTION.
				207	*/
				208	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
				209	tmp = (u64)delta_exec * scale_load_down(weight);
				210	else
				211	tmp = (u64)delta_exec;
				212
				213	if (!lw->inv_weight) {
				214	unsigned long w = scale_load_down(lw->weight);
				215
				216	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
				217	lw->inv_weight = 1;
				218	else if (unlikely(!w))
				219	lw->inv_weight = WMULT_CONST;
				220	else
				221	lw->inv_weight = WMULT_CONST / w;
				222	}
				223
				224	/*
				225	* Check whether we'd overflow the 64-bit multiplication:
				226	*/
				227	if (unlikely(tmp > WMULT_CONST))
				228	tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
				229	WMULT_SHIFT/2);
				230	else
				231	tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
				232
				233	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
				234	}
				235
				236
				237	const struct sched_class fair_sched_class;
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	238
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	239	/**************************************************************
				240	* CFS operations on generic schedulable entities:
				241	*/
				242
				243	#ifdef CONFIG_FAIR_GROUP_SCHED
				244
				245	/* cpu runqueue to which this cfs_rq is attached */
				246	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				247	{
				248	return cfs_rq->rq;
				249	}
				250
/*
 * An entity is a task if it doesn't "own" a runqueue.
 * The argument is parenthesized so expressions such as &p->se are safe.
 */
#define entity_is_task(se)	(!(se)->my_q)
				253
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	254	static inline struct task_struct task_of(struct sched_entity se)
				255	{
				256	#ifdef CONFIG_SCHED_DEBUG
				257	WARN_ON_ONCE(!entity_is_task(se));
				258	#endif
				259	return container_of(se, struct task_struct, se);
				260	}
				261
/*
 * Walk up the scheduling entity hierarchy. Note that @se itself is
 * advanced to its parent on every step and is NULL when the loop ends.
 */
#define for_each_sched_entity(se) for (; se; se = se->parent)
				265
				266	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
				267	{
				268	return p->se.cfs_rq;
				269	}
				270
				271	/* runqueue on which this entity is (to be) queued */
				272	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				273	{
				274	return se->cfs_rq;
				275	}
				276
				277	/* runqueue "owned" by this group */
				278	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				279	{
				280	return grp->my_q;
				281	}
				282
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	283	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
				284	int force_update);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	285
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	286	static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				287	{
				288	if (!cfs_rq->on_list) {
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	289	/*
				290	* Ensure we either appear before our parent (if already
				291	* enqueued) or force our parent to appear after us when it is
				292	* enqueued. The fact that we always enqueue bottom-up
				293	* reduces this to two cases.
				294	*/
				295	if (cfs_rq->tg->parent &&
				296	cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
				297	list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	298	&rq_of(cfs_rq)->leaf_cfs_rq_list);
Paul Turner	67e8625	2010-11-15 15:47:05 -0800	[diff] [blame]	299	} else {
				300	list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
				301	&rq_of(cfs_rq)->leaf_cfs_rq_list);
				302	}
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	303
				304	cfs_rq->on_list = 1;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	305	/* We should have no load, but we need to update last_decay. */
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	306	update_cfs_rq_blocked_load(cfs_rq, 0);
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	307	}
				308	}
				309
				310	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				311	{
				312	if (cfs_rq->on_list) {
				313	list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
				314	cfs_rq->on_list = 0;
				315	}
				316	}
				317
/* Iterate over every leaf cfs_rq linked on @rq's leaf_cfs_rq_list. */
#define for_each_leaf_cfs_rq(rq, cfs_rq)				\
	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list,		\
				leaf_cfs_rq_list)
				321
				322	/* Do the two (enqueued) entities belong to the same group ? */
				323	static inline int
				324	is_same_group(struct sched_entity se, struct sched_entity pse)
				325	{
				326	if (se->cfs_rq == pse->cfs_rq)
				327	return 1;
				328
				329	return 0;
				330	}
				331
				332	static inline struct sched_entity parent_entity(struct sched_entity se)
				333	{
				334	return se->parent;
				335	}
				336
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	337	/* return depth at which a sched entity is present in the hierarchy */
				338	static inline int depth_se(struct sched_entity *se)
				339	{
				340	int depth = 0;
				341
				342	for_each_sched_entity(se)
				343	depth++;
				344
				345	return depth;
				346	}
				347
				348	static void
				349	find_matching_se(struct sched_entity se, struct sched_entity pse)
				350	{
				351	int se_depth, pse_depth;
				352
				353	/*
				354	* preemption test can be made between sibling entities who are in the
				355	* same cfs_rq i.e who have a common parent. Walk up the hierarchy of
				356	* both tasks until we find their ancestors who are siblings of common
				357	* parent.
				358	*/
				359
				360	/* First walk up until both entities are at same depth */
				361	se_depth = depth_se(*se);
				362	pse_depth = depth_se(*pse);
				363
				364	while (se_depth > pse_depth) {
				365	se_depth--;
				366	se = parent_entity(se);
				367	}
				368
				369	while (pse_depth > se_depth) {
				370	pse_depth--;
				371	pse = parent_entity(pse);
				372	}
				373
				374	while (!is_same_group(se, pse)) {
				375	se = parent_entity(se);
				376	pse = parent_entity(pse);
				377	}
				378	}
				379
Peter Zijlstra	8f48894	2009-07-24 12:25:30 +0200	[diff] [blame]	380	#else /* !CONFIG_FAIR_GROUP_SCHED */
				381
				382	static inline struct task_struct task_of(struct sched_entity se)
				383	{
				384	return container_of(se, struct task_struct, se);
				385	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	386
				387	static inline struct rq rq_of(struct cfs_rq cfs_rq)
				388	{
				389	return container_of(cfs_rq, struct rq, cfs);
				390	}
				391
/* Without group scheduling every entity is a task. */
#define entity_is_task(se)	1

/* Without group scheduling the "hierarchy" is a single entity: one pass. */
#define for_each_sched_entity(se) for (; se; se = NULL)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	396
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	397	static inline struct cfs_rq task_cfs_rq(struct task_struct p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	398	{
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	399	return &task_rq(p)->cfs;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	400	}
				401
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	402	static inline struct cfs_rq cfs_rq_of(struct sched_entity se)
				403	{
				404	struct task_struct *p = task_of(se);
				405	struct rq *rq = task_rq(p);
				406
				407	return &rq->cfs;
				408	}
				409
				410	/* runqueue "owned" by this group */
				411	static inline struct cfs_rq group_cfs_rq(struct sched_entity grp)
				412	{
				413	return NULL;
				414	}
				415
/* No leaf list is maintained without group scheduling: nothing to do. */
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) { }
				419
				420	static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
				421	{
				422	}
				423
/* Single iteration over the one cfs_rq embedded in @rq. */
#define for_each_leaf_cfs_rq(rq, cfs_rq) for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
				426
				427	static inline int
				428	is_same_group(struct sched_entity se, struct sched_entity pse)
				429	{
				430	return 1;
				431	}
				432
				433	static inline struct sched_entity parent_entity(struct sched_entity se)
				434	{
				435	return NULL;
				436	}
				437
/* Without group scheduling both entities are already siblings: no-op. */
static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
				442
Peter Zijlstra	b758149	2008-04-19 19:45:00 +0200	[diff] [blame]	443	#endif /* CONFIG_FAIR_GROUP_SCHED */
				444
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	445	static __always_inline
				446	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	447
				448	/**************************************************************
				449	* Scheduling class tree data structure manipulation methods:
				450	*/
				451
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	452	static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	453	{
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	454	s64 delta = (s64)(vruntime - max_vruntime);
Peter Zijlstra	368059a	2007-10-15 17:00:11 +0200	[diff] [blame]	455	if (delta > 0)
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	456	max_vruntime = vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	457
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	458	return max_vruntime;
Peter Zijlstra	02e0431	2007-10-15 17:00:07 +0200	[diff] [blame]	459	}
				460
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	461	static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
Peter Zijlstra	b0ffd24	2007-10-15 17:00:12 +0200	[diff] [blame]	462	{
				463	s64 delta = (s64)(vruntime - min_vruntime);
				464	if (delta < 0)
				465	min_vruntime = vruntime;
				466
				467	return min_vruntime;
				468	}
				469
Fabio Checconi	54fdc58	2009-07-16 12:32:27 +0200	[diff] [blame]	470	static inline int entity_before(struct sched_entity *a,
				471	struct sched_entity *b)
				472	{
				473	return (s64)(a->vruntime - b->vruntime) < 0;
				474	}
				475
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	476	static void update_min_vruntime(struct cfs_rq *cfs_rq)
				477	{
				478	u64 vruntime = cfs_rq->min_vruntime;
				479
				480	if (cfs_rq->curr)
				481	vruntime = cfs_rq->curr->vruntime;
				482
				483	if (cfs_rq->rb_leftmost) {
				484	struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
				485	struct sched_entity,
				486	run_node);
				487
Peter Zijlstra	e17036d	2009-01-15 14:53:39 +0100	[diff] [blame]	488	if (!cfs_rq->curr)
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	489	vruntime = se->vruntime;
				490	else
				491	vruntime = min_vruntime(vruntime, se->vruntime);
				492	}
				493
Andrei Epure	1bf0823	2013-03-12 21:12:24 +0200	[diff] [blame]	494	/* ensure we never gain time by being placed backwards. */
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	495	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	496	#ifndef CONFIG_64BIT
				497	smp_wmb();
				498	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				499	#endif
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	500	}
				501
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	502	/*
				503	* Enqueue an entity into the rb-tree:
				504	*/
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	505	static void __enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	506	{
				507	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
				508	struct rb_node *parent = NULL;
				509	struct sched_entity *entry;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	510	int leftmost = 1;
				511
				512	/*
				513	* Find the right place in the rbtree:
				514	*/
				515	while (*link) {
				516	parent = *link;
				517	entry = rb_entry(parent, struct sched_entity, run_node);
				518	/*
				519	* We dont care about collisions. Nodes with
				520	* the same key stay together.
				521	*/
Stephan Baerwolf	2bd2d6f	2011-07-20 14:46:59 +0200	[diff] [blame]	522	if (entity_before(se, entry)) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	523	link = &parent->rb_left;
				524	} else {
				525	link = &parent->rb_right;
				526	leftmost = 0;
				527	}
				528	}
				529
				530	/*
				531	* Maintain a cache of leftmost tree entries (it is frequently
				532	* used):
				533	*/
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	534	if (leftmost)
Ingo Molnar	57cb499	2007-10-15 17:00:11 +0200	[diff] [blame]	535	cfs_rq->rb_leftmost = &se->run_node;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	536
				537	rb_link_node(&se->run_node, parent, link);
				538	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	539	}
				540
Ingo Molnar	0702e3e	2007-10-15 17:00:14 +0200	[diff] [blame]	541	static void __dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	542	{
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	543	if (cfs_rq->rb_leftmost == &se->run_node) {
				544	struct rb_node *next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	545
				546	next_node = rb_next(&se->run_node);
				547	cfs_rq->rb_leftmost = next_node;
Peter Zijlstra	3fe6974	2008-03-14 20:55:51 +0100	[diff] [blame]	548	}
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	549
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	550	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	551	}
				552
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	553	struct sched_entity __pick_first_entity(struct cfs_rq cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	554	{
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	555	struct rb_node *left = cfs_rq->rb_leftmost;
				556
				557	if (!left)
				558	return NULL;
				559
				560	return rb_entry(left, struct sched_entity, run_node);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	561	}
				562
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	563	static struct sched_entity __pick_next_entity(struct sched_entity se)
				564	{
				565	struct rb_node *next = rb_next(&se->run_node);
				566
				567	if (!next)
				568	return NULL;
				569
				570	return rb_entry(next, struct sched_entity, run_node);
				571	}
				572
				573	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	574	struct sched_entity __pick_last_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	575	{
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	576	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	577
Balbir Singh	70eee74	2008-02-22 13:25:53 +0530	[diff] [blame]	578	if (!last)
				579	return NULL;
Ingo Molnar	7eee3e6	2008-02-22 10:32:21 +0100	[diff] [blame]	580
				581	return rb_entry(last, struct sched_entity, run_node);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	582	}
				583
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	584	/**************************************************************
				585	* Scheduling class statistics methods:
				586	*/
				587
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	588	int sched_proc_update_handler(struct ctl_table *table, int write,
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	589	void __user buffer, size_t lenp,
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	590	loff_t *ppos)
				591	{
Alexey Dobriyan	8d65af7	2009-09-23 15:57:19 -0700	[diff] [blame]	592	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	593	int factor = get_update_sysctl_factor();
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	594
				595	if (ret \|\| !write)
				596	return ret;
				597
				598	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
				599	sysctl_sched_min_granularity);
				600
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	601	#define WRT_SYSCTL(name) \
				602	(normalized_sysctl_##name = sysctl_##name / (factor))
				603	WRT_SYSCTL(sched_min_granularity);
				604	WRT_SYSCTL(sched_latency);
				605	WRT_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt	acb4a84	2009-11-30 12:16:48 +0100	[diff] [blame]	606	#undef WRT_SYSCTL
				607
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	608	return 0;
				609	}
				610	#endif
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	611
				612	/*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	613	* delta /= w
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	614	*/
				615	static inline unsigned long
				616	calc_delta_fair(unsigned long delta, struct sched_entity *se)
				617	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	618	if (unlikely(se->load.weight != NICE_0_LOAD))
				619	delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	620
				621	return delta;
				622	}
				623
				624	/*
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	625	* The idea is to set a period in which each task runs once.
				626	*
Borislav Petkov	532b185	2012-08-08 16:16:04 +0200	[diff] [blame]	627	* When there are too many tasks (sched_nr_latency) we have to stretch
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	628	* this period because otherwise the slices get too small.
				629	*
				630	* p = (nr <= nl) ? l : l*nr/nl
				631	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	632	static u64 __sched_period(unsigned long nr_running)
				633	{
				634	u64 period = sysctl_sched_latency;
Peter Zijlstra	b2be5e9	2007-11-09 22:39:37 +0100	[diff] [blame]	635	unsigned long nr_latency = sched_nr_latency;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	636
				637	if (unlikely(nr_running > nr_latency)) {
Peter Zijlstra	4bf0b77	2008-01-25 21:08:21 +0100	[diff] [blame]	638	period = sysctl_sched_min_granularity;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	639	period *= nr_running;
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	640	}
				641
				642	return period;
				643	}
				644
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	645	/*
				646	* We calculate the wall-time slice from the period by taking a part
				647	* proportional to the weight.
				648	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	649	* s = p*P[w/rw]
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	650	*/
Peter Zijlstra	6d0f0ebd	2007-10-15 17:00:05 +0200	[diff] [blame]	651	static u64 sched_slice(struct cfs_rq cfs_rq, struct sched_entity se)
Peter Zijlstra	2180508	2007-08-25 18:41:53 +0200	[diff] [blame]	652	{
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	653	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	654
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	655	for_each_sched_entity(se) {
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	656	struct load_weight *load;
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	657	struct load_weight lw;
Lin Ming	6272d68	2009-01-15 17:17:15 +0100	[diff] [blame]	658
				659	cfs_rq = cfs_rq_of(se);
				660	load = &cfs_rq->load;
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	661
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	662	if (unlikely(!se->on_rq)) {
Christian Engelmayer	3104bf0	2009-06-16 10:35:12 +0200	[diff] [blame]	663	lw = cfs_rq->load;
Mike Galbraith	0a58244	2009-01-02 12:16:42 +0100	[diff] [blame]	664
				665	update_load_add(&lw, se->load.weight);
				666	load = &lw;
				667	}
				668	slice = calc_delta_mine(slice, se->load.weight, load);
				669	}
				670	return slice;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	671	}
				672
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	673	/*
Andrei Epure	660cc00	2013-03-11 12:03:20 +0200	[diff] [blame]	674	* We calculate the vruntime slice of a to-be-inserted task.
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	675	*
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	676	* vs = s/w
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	677	*/
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	678	static u64 sched_vslice(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	647e7ca	2007-10-15 17:00:13 +0200	[diff] [blame]	679	{
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	680	return calc_delta_fair(sched_slice(cfs_rq, se), se);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	681	}
				682
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	683	#ifdef CONFIG_SMP
				684	static inline void __update_task_entity_contrib(struct sched_entity *se);
				685
				686	/* Give new task start runnable values to heavy its load in infant time */
				687	void init_task_runnable_average(struct task_struct *p)
				688	{
				689	u32 slice;
				690
				691	p->se.avg.decay_count = 0;
				692	slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
				693	p->se.avg.runnable_avg_sum = slice;
				694	p->se.avg.runnable_avg_period = slice;
				695	__update_task_entity_contrib(&p->se);
				696	}
				697	#else
				698	void init_task_runnable_average(struct task_struct *p)
				699	{
				700	}
				701	#endif
				702
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	703	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	704	* Update the current task's runtime statistics. Skip current tasks that
				705	* are not in our scheduling class.
				706	*/
				707	static inline void
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	708	__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
				709	unsigned long delta_exec)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	710	{
Ingo Molnar	bbdba7c	2007-10-15 17:00:06 +0200	[diff] [blame]	711	unsigned long delta_exec_weighted;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	712
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	713	schedstat_set(curr->statistics.exec_max,
				714	max((u64)delta_exec, curr->statistics.exec_max));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	715
				716	curr->sum_exec_runtime += delta_exec;
Ingo Molnar	7a62eab	2007-10-15 17:00:06 +0200	[diff] [blame]	717	schedstat_add(cfs_rq, exec_clock, delta_exec);
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	718	delta_exec_weighted = calc_delta_fair(delta_exec, curr);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	719
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	720	curr->vruntime += delta_exec_weighted;
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	721	update_min_vruntime(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	722	}
				723
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	724	static void update_curr(struct cfs_rq *cfs_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	725	{
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	726	struct sched_entity *curr = cfs_rq->curr;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	727	u64 now = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	728	unsigned long delta_exec;
				729
				730	if (unlikely(!curr))
				731	return;
				732
				733	/*
				734	* Get the amount of time the current task was running
				735	* since the last time we changed load (this cannot
				736	* overflow on 32 bits):
				737	*/
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	738	delta_exec = (unsigned long)(now - curr->exec_start);
Peter Zijlstra	34f28ec	2008-12-16 08:45:31 +0100	[diff] [blame]	739	if (!delta_exec)
				740	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	741
Ingo Molnar	8ebc91d	2007-10-15 17:00:03 +0200	[diff] [blame]	742	__update_curr(cfs_rq, curr, delta_exec);
				743	curr->exec_start = now;
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	744
				745	if (entity_is_task(curr)) {
				746	struct task_struct *curtask = task_of(curr);
				747
Ingo Molnar	f977bb4	2009-09-13 18:15:54 +0200	[diff] [blame]	748	trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	749	cpuacct_charge(curtask, delta_exec);
Frank Mayhar	f06febc	2008-09-12 09:54:39 -0700	[diff] [blame]	750	account_group_exec_runtime(curtask, delta_exec);
Srivatsa Vaddagiri	d842de8	2007-12-02 20:04:49 +0100	[diff] [blame]	751	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	752
				753	account_cfs_rq_runtime(cfs_rq, delta_exec);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	754	}
				755
				756	static inline void
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	757	update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	758	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	759	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	760	}
				761
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	762	/*
				763	* Task is being enqueued - update stats:
				764	*/
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	765	static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	766	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	767	/*
				768	* Are we enqueueing a waiting task? (for current tasks
				769	* a dequeue/enqueue event is a NOP)
				770	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	771	if (se != cfs_rq->curr)
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	772	update_stats_wait_start(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	773	}
				774
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	775	static void
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	776	update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	777	{
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	778	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	779	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	780	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
				781	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	782	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	783	#ifdef CONFIG_SCHEDSTATS
				784	if (entity_is_task(se)) {
				785	trace_sched_stat_wait(task_of(se),
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	786	rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
Peter Zijlstra	768d0c2	2009-07-23 20:13:26 +0200	[diff] [blame]	787	}
				788	#endif
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	789	schedstat_set(se->statistics.wait_start, 0);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	790	}
				791
				792	static inline void
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	793	update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	794	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	795	/*
				796	* Mark the end of the wait period if dequeueing a
				797	* waiting task:
				798	*/
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	799	if (se != cfs_rq->curr)
Ingo Molnar	9ef0a96	2007-08-09 11:16:47 +0200	[diff] [blame]	800	update_stats_wait_end(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	801	}
				802
				803	/*
				804	* We are picking a new current task - update its stats:
				805	*/
				806	static inline void
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	807	update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	808	{
				809	/*
				810	* We are starting a new run period:
				811	*/
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	812	se->exec_start = rq_clock_task(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	813	}
				814
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	815	/**************************************************
				816	* Scheduling class queueing methods:
				817	*/
				818
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	819	#ifdef CONFIG_NUMA_BALANCING
				820	/*
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	821	* Approximate time to scan a full NUMA task in ms. The task scan period is
				822	* calculated based on the tasks virtual memory size and
				823	* numa_balancing_scan_size.
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	824	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	825	unsigned int sysctl_numa_balancing_scan_period_min = 1000;
				826	unsigned int sysctl_numa_balancing_scan_period_max = 60000;
				827	unsigned int sysctl_numa_balancing_scan_period_reset = 60000;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	828
				829	/* Portion of address space to scan in MB */
				830	unsigned int sysctl_numa_balancing_scan_size = 256;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	831
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	832	/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
				833	unsigned int sysctl_numa_balancing_scan_delay = 1000;
				834
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	835	static unsigned int task_nr_scan_windows(struct task_struct *p)
				836	{
				837	unsigned long rss = 0;
				838	unsigned long nr_scan_pages;
				839
				840	/*
				841	* Calculations based on RSS as non-present and empty pages are skipped
				842	* by the PTE scanner and NUMA hinting faults should be trapped based
				843	* on resident pages
				844	*/
				845	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
				846	rss = get_mm_rss(p->mm);
				847	if (!rss)
				848	rss = nr_scan_pages;
				849
				850	rss = round_up(rss, nr_scan_pages);
				851	return rss / nr_scan_pages;
				852	}
				853
				854	/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
				855	#define MAX_SCAN_WINDOW 2560
				856
				857	static unsigned int task_scan_min(struct task_struct *p)
				858	{
				859	unsigned int scan, floor;
				860	unsigned int windows = 1;
				861
				862	if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
				863	windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
				864	floor = 1000 / windows;
				865
				866	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
				867	return max_t(unsigned int, floor, scan);
				868	}
				869
				870	static unsigned int task_scan_max(struct task_struct *p)
				871	{
				872	unsigned int smin = task_scan_min(p);
				873	unsigned int smax;
				874
				875	/* Watch for min being lower than max due to floor calculations */
				876	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
				877	return max(smin, smax);
				878	}
				879
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	880	static void task_numa_placement(struct task_struct *p)
				881	{
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame^]	882	int seq, nid, max_nid = -1;
				883	unsigned long max_faults = 0;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	884
Hugh Dickins	2832bc1	2012-12-19 17:42:16 -0800	[diff] [blame]	885	if (!p->mm) /* for example, ksmd faulting in a user's mm */
				886	return;
				887	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	888	if (p->numa_scan_seq == seq)
				889	return;
				890	p->numa_scan_seq = seq;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	891	p->numa_scan_period_max = task_scan_max(p);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	892
Mel Gorman	688b758	2013-10-07 11:28:58 +0100	[diff] [blame^]	893	/* Find the node with the highest number of faults */
				894	for_each_online_node(nid) {
				895	unsigned long faults = p->numa_faults[nid];
				896	p->numa_faults[nid] >>= 1;
				897	if (faults > max_faults) {
				898	max_faults = faults;
				899	max_nid = nid;
				900	}
				901	}
				902
				903	/* Update the tasks preferred node if necessary */
				904	if (max_faults && max_nid != p->numa_preferred_nid)
				905	p->numa_preferred_nid = max_nid;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	906	}
				907
				908	/*
				909	* Got a PROT_NONE fault for a page on @node.
				910	*/
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	911	void task_numa_fault(int node, int pages, bool migrated)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	912	{
				913	struct task_struct *p = current;
				914
Dave Kleikamp	10e84b9	2013-07-31 13:53:35 -0700	[diff] [blame]	915	if (!numabalancing_enabled)
Mel Gorman	1a687c2	2012-11-22 11:16:36 +0000	[diff] [blame]	916	return;
				917
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	918	/* Allocate buffer to track faults on a per-node basis */
				919	if (unlikely(!p->numa_faults)) {
				920	int size = sizeof(*p->numa_faults) * nr_node_ids;
				921
				922	p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
				923	if (!p->numa_faults)
				924	return;
				925	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	926
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	927	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	928	* If pages are properly placed (did not migrate) then scan slower.
				929	* This is reset periodically in case of phase changes
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	930	*/
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	931	if (!migrated) {
				932	/* Initialise if necessary */
				933	if (!p->numa_scan_period_max)
				934	p->numa_scan_period_max = task_scan_max(p);
				935
				936	p->numa_scan_period = min(p->numa_scan_period_max,
				937	p->numa_scan_period + 10);
				938	}
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	939
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	940	task_numa_placement(p);
Mel Gorman	f809ca9	2013-10-07 11:28:57 +0100	[diff] [blame]	941
				942	p->numa_faults[node] += pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	943	}
				944
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	945	static void reset_ptenuma_scan(struct task_struct *p)
				946	{
				947	ACCESS_ONCE(p->mm->numa_scan_seq)++;
				948	p->mm->numa_scan_offset = 0;
				949	}
				950
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	951	/*
				952	* The expensive part of numa migration is done from task_work context.
				953	* Triggered from task_tick_numa().
				954	*/
				955	void task_numa_work(struct callback_head *work)
				956	{
				957	unsigned long migrate, next_scan, now = jiffies;
				958	struct task_struct *p = current;
				959	struct mm_struct *mm = p->mm;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	960	struct vm_area_struct *vma;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	961	unsigned long start, end;
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	962	unsigned long nr_pte_updates = 0;
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	963	long pages;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	964
				965	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
				966
				967	work->next = work; /* protect against double add */
				968	/*
				969	* Who cares about NUMA placement when they're dying.
				970	*
				971	* NOTE: make sure not to dereference p->mm before this check,
				972	* exit_task_work() happens _after_ exit_mm() so we could be called
				973	* without p->mm even though we still had it when we enqueued this
				974	* work.
				975	*/
				976	if (p->flags & PF_EXITING)
				977	return;
				978
Mel Gorman	7e8d16b	2013-10-07 11:28:54 +0100	[diff] [blame]	979	if (!mm->numa_next_reset || !mm->numa_next_scan) {
				980	mm->numa_next_scan = now +
				981	msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
				982	mm->numa_next_reset = now +
				983	msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				984	}
				985
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	986	/*
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	987	* Reset the scan period if enough time has gone by. Objective is that
				988	* scanning will be reduced if pages are properly placed. As tasks
				989	* can enter different phases this needs to be re-examined. Lacking
				990	* proper tracking of reference behaviour, this blunt hammer is used.
				991	*/
				992	migrate = mm->numa_next_reset;
				993	if (time_after(now, migrate)) {
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	994	p->numa_scan_period = task_scan_min(p);
Mel Gorman	b8593bf	2012-11-21 01:18:23 +0000	[diff] [blame]	995	next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
				996	xchg(&mm->numa_next_reset, next_scan);
				997	}
				998
				999	/*
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1000	* Enforce maximal scan/migration frequency..
				1001	*/
				1002	migrate = mm->numa_next_scan;
				1003	if (time_before(now, migrate))
				1004	return;
				1005
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1006	if (p->numa_scan_period == 0) {
				1007	p->numa_scan_period_max = task_scan_max(p);
				1008	p->numa_scan_period = task_scan_min(p);
				1009	}
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1010
Mel Gorman	fb003b8	2012-11-15 09:01:14 +0000	[diff] [blame]	1011	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1012	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
				1013	return;
				1014
Mel Gorman	e14808b	2012-11-19 10:59:15 +0000	[diff] [blame]	1015	/*
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	1016	* Delay this task enough that another task of this mm will likely win
				1017	* the next time around.
				1018	*/
				1019	p->node_stamp += 2 * TICK_NSEC;
				1020
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1021	start = mm->numa_scan_offset;
				1022	pages = sysctl_numa_balancing_scan_size;
				1023	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
				1024	if (!pages)
				1025	return;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1026
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1027	down_read(&mm->mmap_sem);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1028	vma = find_vma(mm, start);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1029	if (!vma) {
				1030	reset_ptenuma_scan(p);
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1031	start = 0;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1032	vma = mm->mmap;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1033	}
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1034	for (; vma; vma = vma->vm_next) {
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1035	if (!vma_migratable(vma))
				1036	continue;
				1037
				1038	/* Skip small VMAs. They are not likely to be of relevance */
Mel Gorman	221392c	2012-12-17 14:05:53 +0000	[diff] [blame]	1039	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1040	continue;
				1041
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1042	do {
				1043	start = max(start, vma->vm_start);
				1044	end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
				1045	end = min(end, vma->vm_end);
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1046	nr_pte_updates += change_prot_numa(vma, start, end);
				1047
				1048	/*
				1049	* Scan sysctl_numa_balancing_scan_size but ensure that
				1050	* at least one PTE is updated so that unused virtual
				1051	* address space is quickly skipped.
				1052	*/
				1053	if (nr_pte_updates)
				1054	pages -= (end - start) >> PAGE_SHIFT;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1055
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1056	start = end;
				1057	if (pages <= 0)
				1058	goto out;
				1059	} while (end != vma->vm_end);
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1060	}
				1061
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1062	out:
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1063	/*
Mel Gorman	f307cd1	2013-10-07 11:28:56 +0100	[diff] [blame]	1064	* If the whole process was scanned without updates then no NUMA
				1065	* hinting faults are being recorded and scan rate should be lower.
				1066	*/
				1067	if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
				1068	p->numa_scan_period = min(p->numa_scan_period_max,
				1069	p->numa_scan_period << 1);
				1070
				1071	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
				1072	mm->numa_next_scan = next_scan;
				1073	}
				1074
				1075	/*
Peter Zijlstra	c69307d	2013-10-07 11:28:41 +0100	[diff] [blame]	1076	* It is possible to reach the end of the VMA list but the last few
				1077	* VMAs are not guaranteed to the vma_migratable. If they are not, we
				1078	* would find the !migratable VMA on the next scan but not reset the
				1079	* scanner to the start so check it now.
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1080	*/
				1081	if (vma)
Mel Gorman	9f40604	2012-11-14 18:34:32 +0000	[diff] [blame]	1082	mm->numa_scan_offset = start;
Peter Zijlstra	6e5fb22	2012-10-25 14:16:45 +0200	[diff] [blame]	1083	else
				1084	reset_ptenuma_scan(p);
				1085	up_read(&mm->mmap_sem);
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1086	}
				1087
				1088	/*
				1089	* Drive the periodic memory faults..
				1090	*/
				1091	void task_tick_numa(struct rq *rq, struct task_struct *curr)
				1092	{
				1093	struct callback_head *work = &curr->numa_work;
				1094	u64 period, now;
				1095
				1096	/*
				1097	* We don't care about NUMA placement if we don't have memory.
				1098	*/
				1099	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
				1100	return;
				1101
				1102	/*
				1103	* Using runtime rather than walltime has the dual advantage that
				1104	* we (mostly) drive the selection from busy threads and that the
				1105	* task needs to have done some actual work before we bother with
				1106	* NUMA placement.
				1107	*/
				1108	now = curr->se.sum_exec_runtime;
				1109	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
				1110
				1111	if (now - curr->node_stamp > period) {
Peter Zijlstra	4b96a29	2012-10-25 14:16:47 +0200	[diff] [blame]	1112	if (!curr->node_stamp)
Mel Gorman	598f0ec	2013-10-07 11:28:55 +0100	[diff] [blame]	1113	curr->numa_scan_period = task_scan_min(curr);
Peter Zijlstra	19a78d1	2013-10-07 11:28:51 +0100	[diff] [blame]	1114	curr->node_stamp += period;
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	1115
				1116	if (!time_before(jiffies, curr->mm->numa_next_scan)) {
				1117	init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
				1118	task_work_add(curr, work, true);
				1119	}
				1120	}
				1121	}
				1122	#else
				1123	static void task_tick_numa(struct rq *rq, struct task_struct *curr)
				1124	{
				1125	}
				1126	#endif /* CONFIG_NUMA_BALANCING */
				1127
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1128	static void
				1129	account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
				1130	{
				1131	update_load_add(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1132	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1133	update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1134	#ifdef CONFIG_SMP
				1135	if (entity_is_task(se))
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	1136	list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1137	#endif
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1138	cfs_rq->nr_running++;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1139	}
				1140
				1141	static void
				1142	account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
				1143	{
				1144	update_load_sub(&cfs_rq->load, se->load.weight);
Peter Zijlstra	c09595f	2008-06-27 13:41:14 +0200	[diff] [blame]	1145	if (!parent_entity(se))
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	1146	update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	1147	if (entity_is_task(se))
Bharata B Rao	b87f172	2008-09-25 09:53:54 +0530	[diff] [blame]	1148	list_del_init(&se->group_node);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1149	cfs_rq->nr_running--;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1150	}
				1151
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1152	#ifdef CONFIG_FAIR_GROUP_SCHED
				1153	# ifdef CONFIG_SMP
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1154	static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
				1155	{
				1156	long tg_weight;
				1157
				1158	/*
				1159	* Use this CPU's actual weight instead of the last load_contribution
				1160	* to gain a more accurate current total weight. See
				1161	* update_cfs_rq_load_contribution().
				1162	*/
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1163	tg_weight = atomic_long_read(&tg->load_avg);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1164	tg_weight -= cfs_rq->tg_load_contrib;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1165	tg_weight += cfs_rq->load.weight;
				1166
				1167	return tg_weight;
				1168	}
				1169
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1170	static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1171	{
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1172	long tg_weight, load, shares;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1173
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1174	tg_weight = calc_tg_weight(tg, cfs_rq);
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1175	load = cfs_rq->load.weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1176
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1177	shares = (tg->shares * load);
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	1178	if (tg_weight)
				1179	shares /= tg_weight;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1180
				1181	if (shares < MIN_SHARES)
				1182	shares = MIN_SHARES;
				1183	if (shares > tg->shares)
				1184	shares = tg->shares;
				1185
				1186	return shares;
				1187	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1188	# else /* CONFIG_SMP */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1189	static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1190	{
				1191	return tg->shares;
				1192	}
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1193	# endif /* CONFIG_SMP */
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1194	static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
				1195	unsigned long weight)
				1196	{
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1197	if (se->on_rq) {
				1198	/* commit outstanding execution time */
				1199	if (cfs_rq->curr == se)
				1200	update_curr(cfs_rq);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1201	account_entity_dequeue(cfs_rq, se);
Paul Turner	19e5eeb	2010-12-15 19:10:18 -0800	[diff] [blame]	1202	}
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1203
				1204	update_load_set(&se->load, weight);
				1205
				1206	if (se->on_rq)
				1207	account_entity_enqueue(cfs_rq, se);
				1208	}
				1209
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	1210	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
				1211
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1212	static void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1213	{
				1214	struct task_group *tg;
				1215	struct sched_entity *se;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1216	long shares;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1217
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1218	tg = cfs_rq->tg;
				1219	se = tg->se[cpu_of(rq_of(cfs_rq))];
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	1220	if (!se || throttled_hierarchy(cfs_rq))
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1221	return;
Yong Zhang	3ff6dca	2011-01-24 15:33:52 +0800	[diff] [blame]	1222	#ifndef CONFIG_SMP
				1223	if (likely(se->load.weight == tg->shares))
				1224	return;
				1225	#endif
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1226	shares = calc_cfs_shares(cfs_rq, tg);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1227
				1228	reweight_entity(cfs_rq_of(se), se, shares);
				1229	}
				1230	#else /* CONFIG_FAIR_GROUP_SCHED */
Paul Turner	6d5ab29	2011-01-21 20:45:01 -0800	[diff] [blame]	1231	static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1232	{
				1233	}
				1234	#endif /* CONFIG_FAIR_GROUP_SCHED */
				1235
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	1236	#ifdef CONFIG_SMP
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1237	/*
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1238	* We choose a half-life close to 1 scheduling period.
				1239	* Note: The tables below are dependent on this value.
				1240	*/
				1241	#define LOAD_AVG_PERIOD 32
				1242	#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
				1243	#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
				1244
				1245	/* Precomputed fixed inverse multiplies for multiplication by y^n */
				1246	static const u32 runnable_avg_yN_inv[] = {
				1247	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
				1248	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
				1249	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
				1250	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
				1251	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
				1252	0x85aac367, 0x82cd8698,
				1253	};
				1254
				1255	/*
				1256	* Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
				1257	* over-estimates when re-combining.
				1258	*/
				1259	static const u32 runnable_avg_yN_sum[] = {
				1260	0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
				1261	9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
				1262	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
				1263	};
				1264
				1265	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1266	* Approximate:
				1267	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
				1268	*/
				1269	static __always_inline u64 decay_load(u64 val, u64 n)
				1270	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1271	unsigned int local_n;
				1272
				1273	if (!n)
				1274	return val;
				1275	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
				1276	return 0;
				1277
				1278	/* after bounds checking we can collapse to 32-bit */
				1279	local_n = n;
				1280
				1281	/*
				1282	* As y^PERIOD = 1/2, we can combine
				1283	* y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
				1284	* With a look-up table which covers k^n (n<PERIOD)
				1285	*
				1286	* To achieve constant time decay_load.
				1287	*/
				1288	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
				1289	val >>= local_n / LOAD_AVG_PERIOD;
				1290	local_n %= LOAD_AVG_PERIOD;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1291	}
				1292
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1293	val *= runnable_avg_yN_inv[local_n];
				1294	/* We don't use SRR here since we always want to round down. */
				1295	return val >> 32;
				1296	}
				1297
				1298	/*
				1299	* For updates fully spanning n periods, the contribution to runnable
				1300	* average will be: \Sum 1024*y^n
				1301	*
				1302	* We can compute this reasonably efficiently by combining:
				1303	* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
				1304	*/
				1305	static u32 __compute_runnable_contrib(u64 n)
				1306	{
				1307	u32 contrib = 0;
				1308
				1309	if (likely(n <= LOAD_AVG_PERIOD))
				1310	return runnable_avg_yN_sum[n];
				1311	else if (unlikely(n >= LOAD_AVG_MAX_N))
				1312	return LOAD_AVG_MAX;
				1313
				1314	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
				1315	do {
				1316	contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
				1317	contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
				1318
				1319	n -= LOAD_AVG_PERIOD;
				1320	} while (n > LOAD_AVG_PERIOD);
				1321
				1322	contrib = decay_load(contrib, n);
				1323	return contrib + runnable_avg_yN_sum[n];
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1324	}
				1325
				1326	/*
				1327	* We can represent the historical contribution to runnable average as the
				1328	* coefficients of a geometric series. To do this we sub-divide our runnable
				1329	* history into segments of approximately 1ms (1024us); label the segment that
				1330	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
				1331	*
				1332	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
				1333	* p0 p1 p2
				1334	* (now) (~1ms ago) (~2ms ago)
				1335	*
				1336	* Let u_i denote the fraction of p_i that the entity was runnable.
				1337	*
				1338	* We then designate the fractions u_i as our co-efficients, yielding the
				1339	* following representation of historical load:
				1340	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
				1341	*
				1342	* We choose y based on the with of a reasonably scheduling period, fixing:
				1343	* y^32 = 0.5
				1344	*
				1345	* This means that the contribution to load ~32ms ago (u_32) will be weighted
				1346	* approximately half as much as the contribution to load within the last ms
				1347	* (u_0).
				1348	*
				1349	* When a period "rolls over" and we have new u_0`, multiplying the previous
				1350	* sum again by y is sufficient to update:
				1351	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
				1352	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
				1353	*/
				1354	static __always_inline int __update_entity_runnable_avg(u64 now,
				1355	struct sched_avg *sa,
				1356	int runnable)
				1357	{
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1358	u64 delta, periods;
				1359	u32 runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1360	int delta_w, decayed = 0;
				1361
				1362	delta = now - sa->last_runnable_update;
				1363	/*
				1364	* This should only happen when time goes backwards, which it
				1365	* unfortunately does during sched clock init when we swap over to TSC.
				1366	*/
				1367	if ((s64)delta < 0) {
				1368	sa->last_runnable_update = now;
				1369	return 0;
				1370	}
				1371
				1372	/*
				1373	* Use 1024ns as the unit of measurement since it's a reasonable
				1374	* approximation of 1us and fast to compute.
				1375	*/
				1376	delta >>= 10;
				1377	if (!delta)
				1378	return 0;
				1379	sa->last_runnable_update = now;
				1380
				1381	/* delta_w is the amount already accumulated against our next period */
				1382	delta_w = sa->runnable_avg_period % 1024;
				1383	if (delta + delta_w >= 1024) {
				1384	/* period roll-over */
				1385	decayed = 1;
				1386
				1387	/*
				1388	* Now that we know we're crossing a period boundary, figure
				1389	* out how much from delta we need to complete the current
				1390	* period and accrue it.
				1391	*/
				1392	delta_w = 1024 - delta_w;
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1393	if (runnable)
				1394	sa->runnable_avg_sum += delta_w;
				1395	sa->runnable_avg_period += delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1396
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1397	delta -= delta_w;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1398
Paul Turner	5b51f2f	2012-10-04 13:18:32 +0200	[diff] [blame]	1399	/* Figure out how many additional periods this update spans */
				1400	periods = delta / 1024;
				1401	delta %= 1024;
				1402
				1403	sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
				1404	periods + 1);
				1405	sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
				1406	periods + 1);
				1407
				1408	/* Efficiently calculate \sum (1..n_period) 1024y^i /
				1409	runnable_contrib = __compute_runnable_contrib(periods);
				1410	if (runnable)
				1411	sa->runnable_avg_sum += runnable_contrib;
				1412	sa->runnable_avg_period += runnable_contrib;
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1413	}
				1414
				1415	/* Remainder of delta accrued against u_0` */
				1416	if (runnable)
				1417	sa->runnable_avg_sum += delta;
				1418	sa->runnable_avg_period += delta;
				1419
				1420	return decayed;
				1421	}
				1422
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1423	/* Synchronize an entity's decay with its parenting cfs_rq.*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1424	static inline u64 __synchronize_entity_decay(struct sched_entity *se)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1425	{
				1426	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1427	u64 decays = atomic64_read(&cfs_rq->decay_counter);
				1428
				1429	decays -= se->avg.decay_count;
				1430	if (!decays)
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1431	return 0;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1432
				1433	se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
				1434	se->avg.decay_count = 0;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1435
				1436	return decays;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1437	}
				1438
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1439	#ifdef CONFIG_FAIR_GROUP_SCHED
				1440	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1441	int force_update)
				1442	{
				1443	struct task_group *tg = cfs_rq->tg;
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1444	long tg_contrib;
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1445
				1446	tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
				1447	tg_contrib -= cfs_rq->tg_load_contrib;
				1448
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1449	if (force_update \|\| abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
				1450	atomic_long_add(tg_contrib, &tg->load_avg);
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1451	cfs_rq->tg_load_contrib += tg_contrib;
				1452	}
				1453	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1454
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1455	/*
				1456	* Aggregate cfs_rq runnable averages into an equivalent task_group
				1457	* representation for computing load contributions.
				1458	*/
				1459	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1460	struct cfs_rq *cfs_rq)
				1461	{
				1462	struct task_group *tg = cfs_rq->tg;
				1463	long contrib;
				1464
				1465	/* The fraction of a cpu used by this cfs_rq */
				1466	contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
				1467	sa->runnable_avg_period + 1);
				1468	contrib -= cfs_rq->tg_runnable_contrib;
				1469
				1470	if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
				1471	atomic_add(contrib, &tg->runnable_avg);
				1472	cfs_rq->tg_runnable_contrib += contrib;
				1473	}
				1474	}
				1475
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1476	static inline void __update_group_entity_contrib(struct sched_entity *se)
				1477	{
				1478	struct cfs_rq *cfs_rq = group_cfs_rq(se);
				1479	struct task_group *tg = cfs_rq->tg;
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1480	int runnable_avg;
				1481
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1482	u64 contrib;
				1483
				1484	contrib = cfs_rq->tg_load_contrib * tg->shares;
Alex Shi	bf5b986	2013-06-20 10:18:54 +0800	[diff] [blame]	1485	se->avg.load_avg_contrib = div_u64(contrib,
				1486	atomic_long_read(&tg->load_avg) + 1);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1487
				1488	/*
				1489	* For group entities we need to compute a correction term in the case
				1490	* that they are consuming <1 cpu so that we would contribute the same
				1491	* load as a task of equal weight.
				1492	*
				1493	* Explicitly co-ordinating this measurement would be expensive, but
				1494	* fortunately the sum of each cpus contribution forms a usable
				1495	* lower-bound on the true value.
				1496	*
				1497	* Consider the aggregate of 2 contributions. Either they are disjoint
				1498	* (and the sum represents true value) or they are disjoint and we are
				1499	* understating by the aggregate of their overlap.
				1500	*
				1501	* Extending this to N cpus, for a given overlap, the maximum amount we
				1502	* understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
				1503	* cpus that overlap for this interval and w_i is the interval width.
				1504	*
				1505	* On a small machine; the first term is well-bounded which bounds the
				1506	* total error since w_i is a subset of the period. Whereas on a
				1507	* larger machine, while this first term can be larger, if w_i is the
				1508	* of consequential size guaranteed to see n_i*w_i quickly converge to
				1509	* our upper bound of 1-cpu.
				1510	*/
				1511	runnable_avg = atomic_read(&tg->runnable_avg);
				1512	if (runnable_avg < NICE_0_LOAD) {
				1513	se->avg.load_avg_contrib *= runnable_avg;
				1514	se->avg.load_avg_contrib >>= NICE_0_SHIFT;
				1515	}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1516	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1517	#else
				1518	static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
				1519	int force_update) {}
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1520	static inline void __update_tg_runnable_avg(struct sched_avg *sa,
				1521	struct cfs_rq *cfs_rq) {}
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1522	static inline void __update_group_entity_contrib(struct sched_entity *se) {}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1523	#endif
				1524
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1525	static inline void __update_task_entity_contrib(struct sched_entity *se)
				1526	{
				1527	u32 contrib;
				1528
				1529	/* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
				1530	contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
				1531	contrib /= (se->avg.runnable_avg_period + 1);
				1532	se->avg.load_avg_contrib = scale_load(contrib);
				1533	}
				1534
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1535	/* Compute the current contribution to load_avg by se, return any delta */
				1536	static long __update_entity_load_avg_contrib(struct sched_entity *se)
				1537	{
				1538	long old_contrib = se->avg.load_avg_contrib;
				1539
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1540	if (entity_is_task(se)) {
				1541	__update_task_entity_contrib(se);
				1542	} else {
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1543	__update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
Paul Turner	8165e14	2012-10-04 13:18:31 +0200	[diff] [blame]	1544	__update_group_entity_contrib(se);
				1545	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1546
				1547	return se->avg.load_avg_contrib - old_contrib;
				1548	}
				1549
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1550	static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
				1551	long load_contrib)
				1552	{
				1553	if (likely(load_contrib < cfs_rq->blocked_load_avg))
				1554	cfs_rq->blocked_load_avg -= load_contrib;
				1555	else
				1556	cfs_rq->blocked_load_avg = 0;
				1557	}
				1558
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1559	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
				1560
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1561	/* Update a sched_entity's runnable average */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1562	static inline void update_entity_load_avg(struct sched_entity *se,
				1563	int update_cfs_rq)
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1564	{
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1565	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1566	long contrib_delta;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1567	u64 now;
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1568
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1569	/*
				1570	* For a group entity we need to use their owned cfs_rq_clock_task() in
				1571	* case they are the parent of a throttled hierarchy.
				1572	*/
				1573	if (entity_is_task(se))
				1574	now = cfs_rq_clock_task(cfs_rq);
				1575	else
				1576	now = cfs_rq_clock_task(group_cfs_rq(se));
				1577
				1578	if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1579	return;
				1580
				1581	contrib_delta = __update_entity_load_avg_contrib(se);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1582
				1583	if (!update_cfs_rq)
				1584	return;
				1585
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1586	if (se->on_rq)
				1587	cfs_rq->runnable_load_avg += contrib_delta;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1588	else
				1589	subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
				1590	}
				1591
				1592	/*
				1593	* Decay the load contributed by all blocked children and account this so that
				1594	* their contribution may appropriately discounted when they wake up.
				1595	*/
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1596	static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1597	{
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	1598	u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1599	u64 decays;
				1600
				1601	decays = now - cfs_rq->last_decay;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1602	if (!decays && !force_update)
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1603	return;
				1604
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	1605	if (atomic_long_read(&cfs_rq->removed_load)) {
				1606	unsigned long removed_load;
				1607	removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1608	subtract_blocked_load_contrib(cfs_rq, removed_load);
				1609	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1610
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1611	if (decays) {
				1612	cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
				1613	decays);
				1614	atomic64_add(decays, &cfs_rq->decay_counter);
				1615	cfs_rq->last_decay = now;
				1616	}
Paul Turner	c566e8e	2012-10-04 13:18:30 +0200	[diff] [blame]	1617
				1618	__update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1619	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1620
				1621	static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
				1622	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1623	__update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable);
Paul Turner	bb17f65	2012-10-04 13:18:31 +0200	[diff] [blame]	1624	__update_tg_runnable_avg(&rq->avg, &rq->cfs);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	1625	}
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1626
				1627	/* Add the load generated by se into cfs_rq's child load-average */
				1628	static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1629	struct sched_entity *se,
				1630	int wakeup)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1631	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1632	/*
				1633	* We track migrations using entity decay_count <= 0, on a wake-up
				1634	* migration we use a negative decay count to track the remote decays
				1635	* accumulated while sleeping.
Alex Shi	a75cdaa	2013-06-20 10:18:47 +0800	[diff] [blame]	1636	*
				1637	* Newly forked tasks are enqueued with se->avg.decay_count == 0, they
				1638	* are seen by enqueue_entity_load_avg() as a migration with an already
				1639	* constructed load_avg_contrib.
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1640	*/
				1641	if (unlikely(se->avg.decay_count <= 0)) {
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1642	se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1643	if (se->avg.decay_count) {
				1644	/*
				1645	* In a wake-up migration we have to approximate the
				1646	* time sleeping. This is because we can't synchronize
				1647	* clock_task between the two cpus, and it is not
				1648	* guaranteed to be read-safe. Instead, we can
				1649	* approximate this using our carried decays, which are
				1650	* explicitly atomically readable.
				1651	*/
				1652	se->avg.last_runnable_update -= (-se->avg.decay_count)
				1653	<< 20;
				1654	update_entity_load_avg(se, 0);
				1655	/* Indicate that we're now synchronized and on-rq */
				1656	se->avg.decay_count = 0;
				1657	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1658	wakeup = 0;
				1659	} else {
Alex Shi	282cf49	2013-06-20 10:18:48 +0800	[diff] [blame]	1660	/*
				1661	* Task re-woke on same cpu (or else migrate_task_rq_fair()
				1662	* would have made count negative); we must be careful to avoid
				1663	* double-accounting blocked time after synchronizing decays.
				1664	*/
				1665	se->avg.last_runnable_update += __synchronize_entity_decay(se)
				1666	<< 20;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1667	}
				1668
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1669	/* migrated tasks did not contribute to our blocked load */
				1670	if (wakeup) {
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1671	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1672	update_entity_load_avg(se, 0);
				1673	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1674
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1675	cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1676	/* we force update consideration on load-balancer moves */
				1677	update_cfs_rq_blocked_load(cfs_rq, !wakeup);
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1678	}
				1679
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1680	/*
				1681	* Remove se's load from this cfs_rq child load-average, if the entity is
				1682	* transitioning to a blocked state we track its projected decay using
				1683	* blocked_load_avg.
				1684	*/
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1685	static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1686	struct sched_entity *se,
				1687	int sleep)
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1688	{
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1689	update_entity_load_avg(se, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	1690	/* we force update consideration on load-balancer moves */
				1691	update_cfs_rq_blocked_load(cfs_rq, !sleep);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1692
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1693	cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	1694	if (sleep) {
				1695	cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
				1696	se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				1697	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
Paul Turner	2dac754	2012-10-04 13:18:30 +0200	[diff] [blame]	1698	}
Vincent Guittot	642dbc3	2013-04-18 18:34:26 +0200	[diff] [blame]	1699
				1700	/*
				1701	* Update the rq's load with the elapsed running time before entering
				1702	* idle. if the last scheduled task is not a CFS task, idle_enter will
				1703	* be the only way to update the runnable statistic.
				1704	*/
				1705	void idle_enter_fair(struct rq *this_rq)
				1706	{
				1707	update_rq_runnable_avg(this_rq, 1);
				1708	}
				1709
				1710	/*
				1711	* Update the rq's load with the elapsed idle time before a task is
				1712	* scheduled. if the newly scheduled task is not a CFS task, idle_exit will
				1713	* be the only way to update the runnable statistic.
				1714	*/
				1715	void idle_exit_fair(struct rq *this_rq)
				1716	{
				1717	update_rq_runnable_avg(this_rq, 0);
				1718	}
				1719
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1720	#else
/*
 * Empty stand-ins for the load-tracking hooks, used when the code above is
 * compiled out (the controlling #if is outside this excerpt).
 */
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq)
{
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
}

static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int wakeup)
{
}

static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int sleep)
{
}

static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
					      int force_update)
{
}
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	1732	#endif
				1733
/*
 * Account the time an entity spent sleeping and/or blocked, as recorded in
 * se->statistics.sleep_start / block_start, at the point it is enqueued
 * again.  Compiles to an empty function without CONFIG_SCHEDSTATS.
 *
 * Both arguments are pointers: the body dereferences se and hands cfs_rq to
 * rq_of().
 */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	/* per-task accounting and tracing only apply to task entities */
	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;

		/* guard against the clock having gone backwards */
		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;

		/* guard against the clock having gone backwards */
		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}
				1795
/*
 * Debug aid: bump the nr_spread_over schedstat when se's vruntime is more
 * than 3 * sysctl_sched_latency away from cfs_rq->min_vruntime in either
 * direction.  Compiles to an empty function without CONFIG_SCHED_DEBUG.
 *
 * Both arguments are pointers: the body dereferences them.
 */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	/* only the magnitude of the spread matters */
	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
				1808
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1809	static void
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1810	place_entity(struct cfs_rq cfs_rq, struct sched_entity se, int initial)
				1811	{
Peter Zijlstra	1af5f73	2008-10-24 11:06:13 +0200	[diff] [blame]	1812	u64 vruntime = cfs_rq->min_vruntime;
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1813
Peter Zijlstra	2cb8600	2007-11-09 22:39:37 +0100	[diff] [blame]	1814	/*
				1815	* The 'current' period is already promised to the current tasks,
				1816	* however the extra weight of the new task will slow them down a
				1817	* little, place the new task so that it fits in the slot that
				1818	* stays open at the end.
				1819	*/
Peter Zijlstra	94dfb5e	2007-10-15 17:00:05 +0200	[diff] [blame]	1820	if (initial && sched_feat(START_DEBIT))
Peter Zijlstra	f9c0b09	2008-10-17 19:27:04 +0200	[diff] [blame]	1821	vruntime += sched_vslice(cfs_rq, se);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1822
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1823	/* sleeps up to a single latency don't count. */
Mike Galbraith	5ca9880	2010-03-11 17:17:17 +0100	[diff] [blame]	1824	if (!initial) {
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1825	unsigned long thresh = sysctl_sched_latency;
Peter Zijlstra	a7be37a	2008-06-27 13:41:11 +0200	[diff] [blame]	1826
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1827	/*
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1828	* Halve their sleep time's effect, to allow
				1829	* for a gentler effect of sleepers:
				1830	*/
				1831	if (sched_feat(GENTLE_FAIR_SLEEPERS))
				1832	thresh >>= 1;
Ingo Molnar	51e0304	2009-09-16 08:54:45 +0200	[diff] [blame]	1833
Mike Galbraith	a2e7a7e	2009-09-18 09:19:25 +0200	[diff] [blame]	1834	vruntime -= thresh;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1835	}
				1836
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	1837	/* ensure we never gain time by being placed backwards. */
Viresh Kumar	16c8f1c	2012-11-08 13:33:46 +0530	[diff] [blame]	1838	se->vruntime = max_vruntime(se->vruntime, vruntime);
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1839	}
				1840
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1841	static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
				1842
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1843	static void
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1844	enqueue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1845	{
				1846	/*
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1847	* Update the normalized vruntime before updating min_vruntime
Kamalesh Babulal	0fc576d	2013-06-27 11:24:18 +0530	[diff] [blame]	1848	* through calling update_curr().
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1849	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1850	if (!(flags & ENQUEUE_WAKEUP) \|\| (flags & ENQUEUE_WAKING))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1851	se->vruntime += cfs_rq->min_vruntime;
				1852
				1853	/*
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1854	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1855	*/
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	1856	update_curr(cfs_rq);
Paul Turner	f269ae0	2012-10-04 13:18:31 +0200	[diff] [blame]	1857	enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1858	account_entity_enqueue(cfs_rq, se);
				1859	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1860
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1861	if (flags & ENQUEUE_WAKEUP) {
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	1862	place_entity(cfs_rq, se, 0);
Ingo Molnar	2396af6	2007-08-09 11:16:48 +0200	[diff] [blame]	1863	enqueue_sleeper(cfs_rq, se);
Ingo Molnar	e9acbff	2007-10-15 17:00:04 +0200	[diff] [blame]	1864	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1865
Ingo Molnar	d2417e5	2007-08-09 11:16:47 +0200	[diff] [blame]	1866	update_stats_enqueue(cfs_rq, se);
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	1867	check_spread(cfs_rq, se);
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1868	if (se != cfs_rq->curr)
				1869	__enqueue_entity(cfs_rq, se);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	1870	se->on_rq = 1;
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1871
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1872	if (cfs_rq->nr_running == 1) {
Peter Zijlstra	3d4b47b	2010-11-15 15:47:01 -0800	[diff] [blame]	1873	list_add_leaf_cfs_rq(cfs_rq);
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	1874	check_enqueue_throttle(cfs_rq);
				1875	}
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1876	}
				1877
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1878	static void __clear_buddies_last(struct sched_entity *se)
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1879	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1880	for_each_sched_entity(se) {
				1881	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1882	if (cfs_rq->last == se)
				1883	cfs_rq->last = NULL;
				1884	else
				1885	break;
				1886	}
				1887	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1888
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1889	static void __clear_buddies_next(struct sched_entity *se)
				1890	{
				1891	for_each_sched_entity(se) {
				1892	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1893	if (cfs_rq->next == se)
				1894	cfs_rq->next = NULL;
				1895	else
				1896	break;
				1897	}
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1898	}
				1899
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1900	static void __clear_buddies_skip(struct sched_entity *se)
				1901	{
				1902	for_each_sched_entity(se) {
				1903	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				1904	if (cfs_rq->skip == se)
				1905	cfs_rq->skip = NULL;
				1906	else
				1907	break;
				1908	}
				1909	}
				1910
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1911	static void clear_buddies(struct cfs_rq cfs_rq, struct sched_entity se)
				1912	{
Rik van Riel	2c13c919	2011-02-01 09:48:37 -0500	[diff] [blame]	1913	if (cfs_rq->last == se)
				1914	__clear_buddies_last(se);
				1915
				1916	if (cfs_rq->next == se)
				1917	__clear_buddies_next(se);
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	1918
				1919	if (cfs_rq->skip == se)
				1920	__clear_buddies_skip(se);
Peter Zijlstra	a571bbe	2009-01-28 14:51:40 +0100	[diff] [blame]	1921	}
				1922
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	1923	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1924
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1925	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1926	dequeue_entity(struct cfs_rq cfs_rq, struct sched_entity se, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1927	{
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1928	/*
				1929	* Update run-time statistics of the 'current'.
				1930	*/
				1931	update_curr(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1932	dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
Dmitry Adamushko	a2a2d68	2007-10-15 17:00:13 +0200	[diff] [blame]	1933
Ingo Molnar	19b6a2e	2007-08-09 11:16:48 +0200	[diff] [blame]	1934	update_stats_dequeue(cfs_rq, se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1935	if (flags & DEQUEUE_SLEEP) {
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1936	#ifdef CONFIG_SCHEDSTATS
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1937	if (entity_is_task(se)) {
				1938	struct task_struct *tsk = task_of(se);
				1939
				1940	if (tsk->state & TASK_INTERRUPTIBLE)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1941	se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1942	if (tsk->state & TASK_UNINTERRUPTIBLE)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	1943	se->statistics.block_start = rq_clock(rq_of(cfs_rq));
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1944	}
Dmitry Adamushko	db36cc7	2007-10-15 17:00:06 +0200	[diff] [blame]	1945	#endif
Peter Zijlstra	67e9fb2	2007-10-15 17:00:10 +0200	[diff] [blame]	1946	}
				1947
Peter Zijlstra	2002c69	2008-11-11 11:52:33 +0100	[diff] [blame]	1948	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	1949
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	1950	if (se != cfs_rq->curr)
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1951	__dequeue_entity(cfs_rq, se);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1952	se->on_rq = 0;
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	1953	account_entity_dequeue(cfs_rq, se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1954
				1955	/*
				1956	* Normalize the entity after updating the min_vruntime because the
				1957	* update can refer to the ->curr item and we need to reflect this
				1958	* movement in our normalized position.
				1959	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	1960	if (!(flags & DEQUEUE_SLEEP))
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	1961	se->vruntime -= cfs_rq->min_vruntime;
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1962
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	1963	/* return excess runtime on last dequeue */
				1964	return_cfs_rq_runtime(cfs_rq);
				1965
Peter Zijlstra	1e87623	2011-05-17 16:21:10 -0700	[diff] [blame]	1966	update_min_vruntime(cfs_rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	1967	update_cfs_shares(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1968	}
				1969
				1970	/*
				1971	* Preempt the current task with a newly woken task if needed:
				1972	*/
Peter Zijlstra	7c92e54	2007-09-05 14:32:49 +0200	[diff] [blame]	1973	static void
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	1974	check_preempt_tick(struct cfs_rq cfs_rq, struct sched_entity curr)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1975	{
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1976	unsigned long ideal_runtime, delta_exec;
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	1977	struct sched_entity *se;
				1978	s64 delta;
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1979
Peter Zijlstra	6d0f0ebd	2007-10-15 17:00:05 +0200	[diff] [blame]	1980	ideal_runtime = sched_slice(cfs_rq, curr);
Peter Zijlstra	1169783	2007-09-05 14:32:49 +0200	[diff] [blame]	1981	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1982	if (delta_exec > ideal_runtime) {
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	1983	resched_task(rq_of(cfs_rq)->curr);
Mike Galbraith	a9f3e2b	2009-01-28 14:51:39 +0100	[diff] [blame]	1984	/*
				1985	* The current task ran long enough, ensure it doesn't get
				1986	* re-elected due to buddy favours.
				1987	*/
				1988	clear_buddies(cfs_rq, curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1989	return;
				1990	}
				1991
				1992	/*
				1993	* Ensure that a task that missed wakeup preemption by a
				1994	* narrow margin doesn't have to wait for a full slice.
				1995	* This also mitigates buddy induced latencies under load.
				1996	*/
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	1997	if (delta_exec < sysctl_sched_min_granularity)
				1998	return;
				1999
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	2000	se = __pick_first_entity(cfs_rq);
				2001	delta = curr->vruntime - se->vruntime;
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2002
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	2003	if (delta < 0)
				2004	return;
Mike Galbraith	d7d8294	2011-01-05 05:41:17 +0100	[diff] [blame]	2005
Wang Xingchao	f4cfb33	2011-09-16 13:35:52 -0400	[diff] [blame]	2006	if (delta > ideal_runtime)
				2007	resched_task(rq_of(cfs_rq)->curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2008	}
				2009
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	2010	static void
Ingo Molnar	8494f41	2007-08-09 11:16:48 +0200	[diff] [blame]	2011	set_next_entity(struct cfs_rq cfs_rq, struct sched_entity se)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2012	{
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	2013	/* 'current' is not kept within the tree. */
				2014	if (se->on_rq) {
				2015	/*
				2016	* Any task has to be enqueued before it get to execute on
				2017	* a CPU. So account for the time it spent waiting on the
				2018	* runqueue.
				2019	*/
				2020	update_stats_wait_end(cfs_rq, se);
				2021	__dequeue_entity(cfs_rq, se);
				2022	}
				2023
Ingo Molnar	79303e9	2007-08-09 11:16:47 +0200	[diff] [blame]	2024	update_stats_curr_start(cfs_rq, se);
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	2025	cfs_rq->curr = se;
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	2026	#ifdef CONFIG_SCHEDSTATS
				2027	/*
				2028	* Track our maximum slice length, if the CPU's load is at
				2029	* least twice that of our own weight (i.e. dont track it
				2030	* when there are only lesser-weight tasks around):
				2031	*/
Dmitry Adamushko	495eca4	2007-10-15 17:00:06 +0200	[diff] [blame]	2032	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	2033	se->statistics.slice_max = max(se->statistics.slice_max,
Ingo Molnar	eba1ed4	2007-10-15 17:00:02 +0200	[diff] [blame]	2034	se->sum_exec_runtime - se->prev_sum_exec_runtime);
				2035	}
				2036	#endif
Peter Zijlstra	4a55b45	2007-09-05 14:32:49 +0200	[diff] [blame]	2037	se->prev_sum_exec_runtime = se->sum_exec_runtime;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2038	}
				2039
/* Forward declaration; used by pick_next_entity() before the definition. */
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
				2042
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2043	/*
				2044	* Pick the next process, keeping these things in mind, in this order:
				2045	* 1) keep things fair between processes/task groups
				2046	* 2) pick the "next" process, since someone really wants that to run
				2047	* 3) pick the "last" process, for cache locality
				2048	* 4) do not run the "skip" process, if something else is available
				2049	*/
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	2050	static struct sched_entity pick_next_entity(struct cfs_rq cfs_rq)
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2051	{
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2052	struct sched_entity *se = __pick_first_entity(cfs_rq);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2053	struct sched_entity *left = se;
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	2054
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2055	/*
				2056	* Avoid running the skip buddy, if running something else can
				2057	* be done without getting too unfair.
				2058	*/
				2059	if (cfs_rq->skip == se) {
				2060	struct sched_entity *second = __pick_next_entity(se);
				2061	if (second && wakeup_preempt_entity(second, left) < 1)
				2062	se = second;
				2063	}
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2064
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2065	/*
				2066	* Prefer last buddy, try to return the CPU to a preempted task.
				2067	*/
				2068	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
				2069	se = cfs_rq->last;
				2070
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	2071	/*
				2072	* Someone really wants this to run. If it's not unfair, run it.
				2073	*/
				2074	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
				2075	se = cfs_rq->next;
				2076
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	2077	clear_buddies(cfs_rq, se);
Peter Zijlstra	4793241	2008-11-04 21:25:09 +0100	[diff] [blame]	2078
				2079	return se;
Peter Zijlstra	aa2ac25	2008-03-14 21:12:12 +0100	[diff] [blame]	2080	}
				2081
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2082	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
				2083
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	2084	static void put_prev_entity(struct cfs_rq cfs_rq, struct sched_entity prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2085	{
				2086	/*
				2087	* If still on the runqueue then deactivate_task()
				2088	* was not called and update_curr() has to be done:
				2089	*/
				2090	if (prev->on_rq)
Ingo Molnar	b7cc089	2007-08-09 11:16:47 +0200	[diff] [blame]	2091	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2092
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2093	/* throttle cfs_rqs exceeding runtime */
				2094	check_cfs_rq_runtime(cfs_rq);
				2095
Peter Zijlstra	ddc9729	2007-10-15 17:00:10 +0200	[diff] [blame]	2096	check_spread(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2097	if (prev->on_rq) {
Ingo Molnar	5870db5	2007-08-09 11:16:47 +0200	[diff] [blame]	2098	update_stats_wait_start(cfs_rq, prev);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2099	/* Put 'current' back into the tree. */
				2100	__enqueue_entity(cfs_rq, prev);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2101	/* in !on_rq case, update occurred at dequeue */
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2102	update_entity_load_avg(prev, 1);
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2103	}
Ingo Molnar	429d43b	2007-10-15 17:00:03 +0200	[diff] [blame]	2104	cfs_rq->curr = NULL;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2105	}
				2106
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2107	static void
				2108	entity_tick(struct cfs_rq cfs_rq, struct sched_entity curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2109	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2110	/*
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2111	* Update run-time statistics of the 'current'.
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2112	*/
Dmitry Adamushko	30cfdcf	2007-10-15 17:00:07 +0200	[diff] [blame]	2113	update_curr(cfs_rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2114
Paul Turner	43365bd	2010-12-15 19:10:17 -0800	[diff] [blame]	2115	/*
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2116	* Ensure that runnable average is periodically updated.
				2117	*/
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2118	update_entity_load_avg(curr, 1);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	2119	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	bf0bd94	2013-07-26 23:48:42 +0200	[diff] [blame]	2120	update_cfs_shares(cfs_rq);
Paul Turner	9d85f21	2012-10-04 13:18:29 +0200	[diff] [blame]	2121
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2122	#ifdef CONFIG_SCHED_HRTICK
				2123	/*
				2124	* queued ticks are scheduled to match the slice, so don't bother
				2125	* validating it and just reschedule.
				2126	*/
Harvey Harrison	983ed7a	2008-04-24 18:17:55 -0700	[diff] [blame]	2127	if (queued) {
				2128	resched_task(rq_of(cfs_rq)->curr);
				2129	return;
				2130	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2131	/*
				2132	* don't let the period tick interfere with the hrtick preemption
				2133	*/
				2134	if (!sched_feat(DOUBLE_TICK) &&
				2135	hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
				2136	return;
				2137	#endif
				2138
Yong Zhang	2c2efae	2011-07-29 16:20:33 +0800	[diff] [blame]	2139	if (cfs_rq->nr_running > 1)
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	2140	check_preempt_tick(cfs_rq, curr);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2141	}
				2142
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2143
				2144	/**************************************************
				2145	* CFS bandwidth control machinery
				2146	*/
				2147
				2148	#ifdef CONFIG_CFS_BANDWIDTH
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2149
				2150	#ifdef HAVE_JUMP_LABEL
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2151	static struct static_key __cfs_bandwidth_used;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2152
				2153	static inline bool cfs_bandwidth_used(void)
				2154	{
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2155	return static_key_false(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2156	}
				2157
				2158	void account_cfs_bandwidth_used(int enabled, int was_enabled)
				2159	{
				2160	/* only need to count groups transitioning between enabled/!enabled */
				2161	if (enabled && !was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2162	static_key_slow_inc(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2163	else if (!enabled && was_enabled)
Ingo Molnar	c5905af	2012-02-24 08:31:31 +0100	[diff] [blame]	2164	static_key_slow_dec(&__cfs_bandwidth_used);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2165	}
				2166	#else /* HAVE_JUMP_LABEL */
				2167	static bool cfs_bandwidth_used(void)
				2168	{
				2169	return true;
				2170	}
				2171
				2172	void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
				2173	#endif /* HAVE_JUMP_LABEL */
				2174
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2175	/*
				2176	* default period for cfs group bandwidth.
				2177	* default: 0.1s, units: nanoseconds
				2178	*/
				2179	static inline u64 default_cfs_period(void)
				2180	{
				2181	return 100000000ULL;
				2182	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2183
				2184	static inline u64 sched_cfs_bandwidth_slice(void)
				2185	{
				2186	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
				2187	}
				2188
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2189	/*
				2190	* Replenish runtime according to assigned quota and update expiration time.
				2191	* We use sched_clock_cpu directly instead of rq->clock to avoid adding
				2192	* additional synchronization around rq->lock.
				2193	*
				2194	* requires cfs_b->lock
				2195	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2196	void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2197	{
				2198	u64 now;
				2199
				2200	if (cfs_b->quota == RUNTIME_INF)
				2201	return;
				2202
				2203	now = sched_clock_cpu(smp_processor_id());
				2204	cfs_b->runtime = cfs_b->quota;
				2205	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
				2206	}
				2207
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2208	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				2209	{
				2210	return &tg->cfs_bandwidth;
				2211	}
				2212
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2213	/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
				2214	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2215	{
				2216	if (unlikely(cfs_rq->throttle_count))
				2217	return cfs_rq->throttled_clock_task;
				2218
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2219	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2220	}
				2221
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2222	/* returns 0 on failure to allocate runtime */
				2223	static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2224	{
				2225	struct task_group *tg = cfs_rq->tg;
				2226	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2227	u64 amount = 0, min_amount, expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2228
				2229	/* note: this is a positive sum as runtime_remaining <= 0 */
				2230	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
				2231
				2232	raw_spin_lock(&cfs_b->lock);
				2233	if (cfs_b->quota == RUNTIME_INF)
				2234	amount = min_amount;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2235	else {
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2236	/*
				2237	* If the bandwidth pool has become inactive, then at least one
				2238	* period must have elapsed since the last consumption.
				2239	* Refresh the global state and ensure bandwidth timer becomes
				2240	* active.
				2241	*/
				2242	if (!cfs_b->timer_active) {
				2243	__refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2244	__start_cfs_bandwidth(cfs_b);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2245	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2246
				2247	if (cfs_b->runtime > 0) {
				2248	amount = min(cfs_b->runtime, min_amount);
				2249	cfs_b->runtime -= amount;
				2250	cfs_b->idle = 0;
				2251	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2252	}
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2253	expires = cfs_b->runtime_expires;
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2254	raw_spin_unlock(&cfs_b->lock);
				2255
				2256	cfs_rq->runtime_remaining += amount;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2257	/*
				2258	* we may have advanced our local expiration to account for allowed
				2259	* spread between our sched_clock and the one on which runtime was
				2260	* issued.
				2261	*/
				2262	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
				2263	cfs_rq->runtime_expires = expires;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2264
				2265	return cfs_rq->runtime_remaining > 0;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2266	}
				2267
				2268	/*
				2269	* Note: This depends on the synchronization provided by sched_clock and the
				2270	* fact that rq->clock snapshots this value.
				2271	*/
				2272	static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2273	{
				2274	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2275
				2276	/* if the deadline is ahead of our clock, nothing to do */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2277	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2278	return;
				2279
				2280	if (cfs_rq->runtime_remaining < 0)
				2281	return;
				2282
				2283	/*
				2284	* If the local deadline has passed we have to consider the
				2285	* possibility that our sched_clock is 'fast' and the global deadline
				2286	* has not truly expired.
				2287	*
				2288	* Fortunately we can check determine whether this the case by checking
				2289	* whether the global deadline has advanced.
				2290	*/
				2291
				2292	if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
				2293	/* extend local deadline, drift is bounded above by 2 ticks */
				2294	cfs_rq->runtime_expires += TICK_NSEC;
				2295	} else {
				2296	/* global deadline is ahead, expiration has passed */
				2297	cfs_rq->runtime_remaining = 0;
				2298	}
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2299	}
				2300
				2301	static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2302	unsigned long delta_exec)
				2303	{
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2304	/* dock delta_exec before expiring quota (as it could span periods) */
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2305	cfs_rq->runtime_remaining -= delta_exec;
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2306	expire_cfs_rq_runtime(cfs_rq);
				2307
				2308	if (likely(cfs_rq->runtime_remaining > 0))
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2309	return;
				2310
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2311	/*
				2312	* if we're unable to extend our runtime we resched so that the active
				2313	* hierarchy can be throttled
				2314	*/
				2315	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
				2316	resched_task(rq_of(cfs_rq)->curr);
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2317	}
				2318
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2319	static __always_inline
				2320	void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2321	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2322	if (!cfs_bandwidth_used() \|\| !cfs_rq->runtime_enabled)
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	2323	return;
				2324
				2325	__account_cfs_rq_runtime(cfs_rq, delta_exec);
				2326	}
				2327
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2328	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2329	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2330	return cfs_bandwidth_used() && cfs_rq->throttled;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2331	}
				2332
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2333	/* check whether cfs_rq, or any parent, is throttled */
				2334	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2335	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2336	return cfs_bandwidth_used() && cfs_rq->throttle_count;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2337	}
				2338
				2339	/*
				2340	* Ensure that neither of the group entities corresponding to src_cpu or
				2341	* dest_cpu are members of a throttled hierarchy when performing group
				2342	* load-balance operations.
				2343	*/
				2344	static inline int throttled_lb_pair(struct task_group *tg,
				2345	int src_cpu, int dest_cpu)
				2346	{
				2347	struct cfs_rq src_cfs_rq, dest_cfs_rq;
				2348
				2349	src_cfs_rq = tg->cfs_rq[src_cpu];
				2350	dest_cfs_rq = tg->cfs_rq[dest_cpu];
				2351
				2352	return throttled_hierarchy(src_cfs_rq) \|\|
				2353	throttled_hierarchy(dest_cfs_rq);
				2354	}
				2355
				2356	/* updated child weight may affect parent so we have to do this bottom up */
				2357	static int tg_unthrottle_up(struct task_group tg, void data)
				2358	{
				2359	struct rq *rq = data;
				2360	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2361
				2362	cfs_rq->throttle_count--;
				2363	#ifdef CONFIG_SMP
				2364	if (!cfs_rq->throttle_count) {
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2365	/* adjust cfs_rq_clock_task() */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2366	cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2367	cfs_rq->throttled_clock_task;
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2368	}
				2369	#endif
				2370
				2371	return 0;
				2372	}
				2373
				2374	static int tg_throttle_down(struct task_group tg, void data)
				2375	{
				2376	struct rq *rq = data;
				2377	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
				2378
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	2379	/* group is entering throttled state, stop time */
				2380	if (!cfs_rq->throttle_count)
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2381	cfs_rq->throttled_clock_task = rq_clock_task(rq);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2382	cfs_rq->throttle_count++;
				2383
				2384	return 0;
				2385	}
				2386
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2387	static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2388	{
				2389	struct rq *rq = rq_of(cfs_rq);
				2390	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2391	struct sched_entity *se;
				2392	long task_delta, dequeue = 1;
				2393
				2394	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
				2395
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2396	/* freeze hierarchy runnable averages while throttled */
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2397	rcu_read_lock();
				2398	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
				2399	rcu_read_unlock();
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2400
				2401	task_delta = cfs_rq->h_nr_running;
				2402	for_each_sched_entity(se) {
				2403	struct cfs_rq *qcfs_rq = cfs_rq_of(se);
				2404	/* throttled entity or throttle-on-deactivate */
				2405	if (!se->on_rq)
				2406	break;
				2407
				2408	if (dequeue)
				2409	dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
				2410	qcfs_rq->h_nr_running -= task_delta;
				2411
				2412	if (qcfs_rq->load.weight)
				2413	dequeue = 0;
				2414	}
				2415
				2416	if (!se)
				2417	rq->nr_running -= task_delta;
				2418
				2419	cfs_rq->throttled = 1;
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2420	cfs_rq->throttled_clock = rq_clock(rq);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2421	raw_spin_lock(&cfs_b->lock);
				2422	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
				2423	raw_spin_unlock(&cfs_b->lock);
				2424	}
				2425
/*
 * unthrottle_cfs_rq() - lift the throttle on @cfs_rq and re-attach its
 * group entity to the scheduling hierarchy.
 *
 * Clears cfs_rq->throttled, accounts the time spent throttled into
 * cfs_b->throttled_time and unlinks the cfs_rq from
 * cfs_b->throttled_cfs_rq (both under cfs_b->lock), then propagates the
 * queued task count back up through the parent cfs_rqs.
 *
 * distribute_cfs_runtime() calls this with rq->lock held.
 */
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2426	void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2427	{
				2428	struct rq *rq = rq_of(cfs_rq);
				2429	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2430	struct sched_entity *se;
/* stays 1 until the walk below meets an entity that is already on_rq */
				2431	int enqueue = 1;
				2432	long task_delta;
				2433
/* the group entity that represents this cfs_rq on this cpu */
Michael Wang	22b958d	2013-06-04 14:23:39 +0800	[diff] [blame]	2434	se = cfs_rq->tg->se[cpu_of(rq)];
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2435
				2436	cfs_rq->throttled = 0;
Frederic Weisbecker	1a55af2	2013-04-12 01:51:01 +0200	[diff] [blame]	2437
/* refresh the rq clock so the throttled-time delta below is current */
				2438	update_rq_clock(rq);
				2439
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2440	raw_spin_lock(&cfs_b->lock);
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2441	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2442	list_del_rcu(&cfs_rq->throttled_list);
				2443	raw_spin_unlock(&cfs_b->lock);
				2444
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2445	/* update hierarchical throttle state */
				2446	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
				2447
/* nothing is queued on this cfs_rq, so there is nothing to re-enqueue */
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2448	if (!cfs_rq->load.weight)
				2449	return;
				2450
/*
 * Walk up from the group entity: enqueue each level until one is found
 * already on_rq, but keep adding task_delta to h_nr_running at every
 * level visited. The walk stops early at a throttled ancestor.
 */
				2451	task_delta = cfs_rq->h_nr_running;
				2452	for_each_sched_entity(se) {
				2453	if (se->on_rq)
				2454	enqueue = 0;
				2455
				2456	cfs_rq = cfs_rq_of(se);
				2457	if (enqueue)
				2458	enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
				2459	cfs_rq->h_nr_running += task_delta;
				2460
				2461	if (cfs_rq_throttled(cfs_rq))
				2462	break;
				2463	}
				2464
/*
 * NOTE(review): se is presumably NULL only when the walk reached the top
 * without hitting a throttled ancestor; only then is rq->nr_running
 * credited.
 */
				2465	if (!se)
				2466	rq->nr_running += task_delta;
				2467
				2468	/* determine whether we need to wake up potentially idle cpu */
				2469	if (rq->curr == rq->idle && rq->cfs.nr_running)
				2470	resched_task(rq->curr);
				2471	}
				2472
/*
 * distribute_cfs_runtime() - hand out @remaining ns of runtime to the
 * cfs_rqs on @cfs_b's throttled list.
 *
 * @cfs_b:     bandwidth pool whose throttled_cfs_rq list is walked
 * @remaining: runtime available for distribution
 * @expires:   expiry stamp written to each cfs_rq that receives runtime
 *
 * The list is walked under rcu_read_lock() and each cfs_rq is handled
 * with its rq->lock held. Returns the runtime left undistributed.
 */
				2473	static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
				2474	u64 remaining, u64 expires)
				2475	{
				2476	struct cfs_rq *cfs_rq;
				2477	u64 runtime = remaining;
				2478
				2479	rcu_read_lock();
				2480	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
				2481	throttled_list) {
				2482	struct rq *rq = rq_of(cfs_rq);
				2483
				2484	raw_spin_lock(&rq->lock);
/* re-check under rq->lock; skip entries that are no longer throttled */
				2485	if (!cfs_rq_throttled(cfs_rq))
				2486	goto next;
				2487
/* grant just enough to bring runtime_remaining to 1, capped at what is left */
				2488	runtime = -cfs_rq->runtime_remaining + 1;
				2489	if (runtime > remaining)
				2490	runtime = remaining;
				2491	remaining -= runtime;
				2492
				2493	cfs_rq->runtime_remaining += runtime;
				2494	cfs_rq->runtime_expires = expires;
				2495
				2496	/* we check whether we're throttled above */
				2497	if (cfs_rq->runtime_remaining > 0)
				2498	unthrottle_cfs_rq(cfs_rq);
				2499
				2500	next:
				2501	raw_spin_unlock(&rq->lock);
				2502
/* pool exhausted: stop walking the list */
				2503	if (!remaining)
				2504	break;
				2505	}
				2506	rcu_read_unlock();
				2507
				2508	return remaining;
				2509	}
				2510
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2511	/*
				2512	* Responsible for refilling a task_group's bandwidth and unthrottling its
				2513	* cfs_rqs as appropriate. If there has been no activity within the last
				2514	* period the timer is deactivated until scheduling resumes; cfs_b->idle is
				2515	* used to track this state.
				2516	*/
/*
 * @cfs_b:   bandwidth pool being refreshed
 * @overrun: number of periods elapsed, added to nr_periods (and to
 *           nr_throttled when throttled cfs_rqs exist)
 *
 * Returns non-zero when the pool is idle; sched_cfs_period_timer() then
 * does not restart the hrtimer. Takes cfs_b->lock, dropping it around
 * each distribute_cfs_runtime() call.
 */
				2517	static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
				2518	{
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2519	u64 runtime, runtime_expires;
				2520	int idle = 1, throttled;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2521
				2522	raw_spin_lock(&cfs_b->lock);
				2523	/* no need to continue the timer with no bandwidth constraint */
				2524	if (cfs_b->quota == RUNTIME_INF)
				2525	goto out_unlock;
				2526
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2527	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2528	/* idle depends on !throttled (for the case of a large deficit) */
				2529	idle = cfs_b->idle && !throttled;
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2530	cfs_b->nr_periods += overrun;
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2531
Paul Turner	a9cf55b	2011-07-21 09:43:32 -0700	[diff] [blame]	2532	/* if we're going inactive then everything else can be deferred */
				2533	if (idle)
				2534	goto out_unlock;
				2535
				2536	__refill_cfs_bandwidth_runtime(cfs_b);
				2537
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2538	if (!throttled) {
				2539	/* mark as potentially idle for the upcoming period */
				2540	cfs_b->idle = 1;
				2541	goto out_unlock;
				2542	}
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2543
Nikhil Rao	e8da1b1	2011-07-21 09:43:40 -0700	[diff] [blame]	2544	/* account preceding periods in which throttling occurred */
				2545	cfs_b->nr_throttled += overrun;
				2546
Paul Turner	671fd9d	2011-07-21 09:43:34 -0700	[diff] [blame]	2547	/*
				2548	* There are throttled entities so we must first use the new bandwidth
				2549	* to unthrottle them before making it generally available. This
				2550	* ensures that all existing debts will be paid before a new cfs_rq is
				2551	* allowed to run.
				2552	*/
/* take the whole refilled pool private; it is handed back at the end */
				2553	runtime = cfs_b->runtime;
				2554	runtime_expires = cfs_b->runtime_expires;
				2555	cfs_b->runtime = 0;
				2556
				2557	/*
				2558	* This check is repeated as we are holding onto the new bandwidth
				2559	* while we unthrottle. This can potentially race with an unthrottled
				2560	* group trying to acquire new bandwidth from the global pool.
				2561	*/
				2562	while (throttled && runtime > 0) {
				2563	raw_spin_unlock(&cfs_b->lock);
				2564	/* we can't nest cfs_b->lock while distributing bandwidth */
				2565	runtime = distribute_cfs_runtime(cfs_b, runtime,
				2566	runtime_expires);
				2567	raw_spin_lock(&cfs_b->lock);
				2568
				2569	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
				2570	}
				2571
				2572	/* return (any) remaining runtime */
				2573	cfs_b->runtime = runtime;
				2574	/*
				2575	* While we are ensured activity in the period following an
				2576	* unthrottle, this also covers the case in which the new bandwidth is
				2577	* insufficient to cover the existing bandwidth deficit. (Forcing the
				2578	* timer to remain active while there are any throttled entities.)
				2579	*/
				2580	cfs_b->idle = 0;
Paul Turner	58088ad	2011-07-21 09:43:31 -0700	[diff] [blame]	2581	out_unlock:
/* going idle: clear timer_active so __start_cfs_bandwidth() re-arms the timer */
				2582	if (idle)
				2583	cfs_b->timer_active = 0;
				2584	raw_spin_unlock(&cfs_b->lock);
				2585
				2586	return idle;
				2587	}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2588
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2589	/* a cfs_rq won't donate quota below this amount */
				2590	static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
				2591	/* minimum remaining period time to redistribute slack quota */
				2592	static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
				2593	/* how long we wait to gather additional slack before distributing */
				2594	static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
				2595
				2596	/* are we near the end of the current quota period? */
				2597	static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
				2598	{
				2599	struct hrtimer *refresh_timer = &cfs_b->period_timer;
				2600	u64 remaining;
				2601
				2602	/* if the call-back is running a quota refresh is already occurring */
				2603	if (hrtimer_callback_running(refresh_timer))
				2604	return 1;
				2605
				2606	/* is a quota refresh about to occur? */
				2607	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
				2608	if (remaining < min_expire)
				2609	return 1;
				2610
				2611	return 0;
				2612	}
				2613
				2614	static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
				2615	{
				2616	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
				2617
				2618	/* if there's a quota refresh soon don't bother with slack */
				2619	if (runtime_refresh_within(cfs_b, min_left))
				2620	return;
				2621
				2622	start_bandwidth_timer(&cfs_b->slack_timer,
				2623	ns_to_ktime(cfs_bandwidth_slack_period));
				2624	}
				2625
				2626	/* we know any runtime found here is valid as update_curr() precedes return */
				2627	static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2628	{
				2629	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2630	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
				2631
				2632	if (slack_runtime <= 0)
				2633	return;
				2634
				2635	raw_spin_lock(&cfs_b->lock);
				2636	if (cfs_b->quota != RUNTIME_INF &&
				2637	cfs_rq->runtime_expires == cfs_b->runtime_expires) {
				2638	cfs_b->runtime += slack_runtime;
				2639
				2640	/* we are under rq->lock, defer unthrottling using a timer */
				2641	if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
				2642	!list_empty(&cfs_b->throttled_cfs_rq))
				2643	start_cfs_slack_bandwidth(cfs_b);
				2644	}
				2645	raw_spin_unlock(&cfs_b->lock);
				2646
				2647	/* even if it's not valid for return we don't want to try again */
				2648	cfs_rq->runtime_remaining -= slack_runtime;
				2649	}
				2650
				2651	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2652	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2653	if (!cfs_bandwidth_used())
				2654	return;
				2655
Paul Turner	fccfdc6	2011-11-07 20:26:34 -0800	[diff] [blame]	2656	if (!cfs_rq->runtime_enabled \|\| cfs_rq->nr_running)
Paul Turner	d8b4986	2011-07-21 09:43:41 -0700	[diff] [blame]	2657	return;
				2658
				2659	__return_cfs_rq_runtime(cfs_rq);
				2660	}
				2661
				2662	/*
				2663	* This is done with a timer (instead of inline with bandwidth return) since
				2664	* it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
				2665	*/
				2666	static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
				2667	{
				2668	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
				2669	u64 expires;
				2670
				2671	/* confirm we're still not at a refresh boundary */
				2672	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
				2673	return;
				2674
				2675	raw_spin_lock(&cfs_b->lock);
				2676	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
				2677	runtime = cfs_b->runtime;
				2678	cfs_b->runtime = 0;
				2679	}
				2680	expires = cfs_b->runtime_expires;
				2681	raw_spin_unlock(&cfs_b->lock);
				2682
				2683	if (!runtime)
				2684	return;
				2685
				2686	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
				2687
				2688	raw_spin_lock(&cfs_b->lock);
				2689	if (expires == cfs_b->runtime_expires)
				2690	cfs_b->runtime = runtime;
				2691	raw_spin_unlock(&cfs_b->lock);
				2692	}
				2693
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2694	/*
				2695	* When a group wakes up we want to make sure that its quota is not already
				2696	* expired/exceeded, otherwise it may be allowed to steal additional ticks of
				2697	* runtime as update_curr() throttling can not not trigger until it's on-rq.
				2698	*/
				2699	static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
				2700	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2701	if (!cfs_bandwidth_used())
				2702	return;
				2703
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2704	/* an active group must be handled by the update_curr()->put() path */
				2705	if (!cfs_rq->runtime_enabled \|\| cfs_rq->curr)
				2706	return;
				2707
				2708	/* ensure the group is not already throttled */
				2709	if (cfs_rq_throttled(cfs_rq))
				2710	return;
				2711
				2712	/* update runtime allocation */
				2713	account_cfs_rq_runtime(cfs_rq, 0);
				2714	if (cfs_rq->runtime_remaining <= 0)
				2715	throttle_cfs_rq(cfs_rq);
				2716	}
				2717
				2718	/* conditionally throttle active cfs_rq's from put_prev_entity() */
				2719	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2720	{
Paul Turner	56f570e	2011-11-07 20:26:33 -0800	[diff] [blame]	2721	if (!cfs_bandwidth_used())
				2722	return;
				2723
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2724	if (likely(!cfs_rq->runtime_enabled \|\| cfs_rq->runtime_remaining > 0))
				2725	return;
				2726
				2727	/*
				2728	* it's possible for a throttled entity to be forced into a running
				2729	* state (e.g. set_curr_task), in this case we're finished.
				2730	*/
				2731	if (cfs_rq_throttled(cfs_rq))
				2732	return;
				2733
				2734	throttle_cfs_rq(cfs_rq);
				2735	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2736
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2737	static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
				2738	{
				2739	struct cfs_bandwidth *cfs_b =
				2740	container_of(timer, struct cfs_bandwidth, slack_timer);
				2741	do_sched_cfs_slack_timer(cfs_b);
				2742
				2743	return HRTIMER_NORESTART;
				2744	}
				2745
				2746	static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
				2747	{
				2748	struct cfs_bandwidth *cfs_b =
				2749	container_of(timer, struct cfs_bandwidth, period_timer);
				2750	ktime_t now;
				2751	int overrun;
				2752	int idle = 0;
				2753
				2754	for (;;) {
				2755	now = hrtimer_cb_get_time(timer);
				2756	overrun = hrtimer_forward(timer, now, cfs_b->period);
				2757
				2758	if (!overrun)
				2759	break;
				2760
				2761	idle = do_sched_cfs_period_timer(cfs_b, overrun);
				2762	}
				2763
				2764	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
				2765	}
				2766
				2767	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2768	{
				2769	raw_spin_lock_init(&cfs_b->lock);
				2770	cfs_b->runtime = 0;
				2771	cfs_b->quota = RUNTIME_INF;
				2772	cfs_b->period = ns_to_ktime(default_cfs_period());
				2773
				2774	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
				2775	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2776	cfs_b->period_timer.function = sched_cfs_period_timer;
				2777	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
				2778	cfs_b->slack_timer.function = sched_cfs_slack_timer;
				2779	}
				2780
				2781	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
				2782	{
				2783	cfs_rq->runtime_enabled = 0;
				2784	INIT_LIST_HEAD(&cfs_rq->throttled_list);
				2785	}
				2786
/*
 * __start_cfs_bandwidth() - (re)arm @cfs_b's period timer.
 *
 * Entered with cfs_b->lock held and returns with it held on every path,
 * but the lock is dropped while a still-active timer is cancelled.
 */
				2787	/* requires cfs_b->lock, may release to reprogram timer */
				2788	void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2789	{
				2790	/*
				2791	* The timer may be active because we're trying to set a new bandwidth
				2792	* period or because we're racing with the tear-down path
				2793	* (timer_active==0 becomes visible before the hrtimer call-back
				2794	* terminates). In either case we ensure that it's re-programmed
				2795	*/
				2796	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
				2797	raw_spin_unlock(&cfs_b->lock);
				2798	/* ensure cfs_b->lock is available while we wait */
				2799	hrtimer_cancel(&cfs_b->period_timer);
				2800
				2801	raw_spin_lock(&cfs_b->lock);
				2802	/* if someone else restarted the timer then we're done */
				2803	if (cfs_b->timer_active)
				2804	return;
				2805	}
				2806
/* timer is inactive and the lock is held: mark it active and start it */
				2807	cfs_b->timer_active = 1;
				2808	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
				2809	}
				2810
				2811	static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
				2812	{
				2813	hrtimer_cancel(&cfs_b->period_timer);
				2814	hrtimer_cancel(&cfs_b->slack_timer);
				2815	}
				2816
Arnd Bergmann	38dc334	2013-01-25 14:14:22 +0000	[diff] [blame]	2817	static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2818	{
				2819	struct cfs_rq *cfs_rq;
				2820
				2821	for_each_leaf_cfs_rq(rq, cfs_rq) {
				2822	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
				2823
				2824	if (!cfs_rq->runtime_enabled)
				2825	continue;
				2826
				2827	/*
				2828	* clock_task is not advancing so we just need to make sure
				2829	* there's some valid quota amount
				2830	*/
				2831	cfs_rq->runtime_remaining = cfs_b->quota;
				2832	if (cfs_rq_throttled(cfs_rq))
				2833	unthrottle_cfs_rq(cfs_rq);
				2834	}
				2835	}
				2836
				2837	#else /* CONFIG_CFS_BANDWIDTH */
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2838	static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
				2839	{
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	2840	return rq_clock_task(rq_of(cfs_rq));
Paul Turner	f1b1728	2012-10-04 13:18:31 +0200	[diff] [blame]	2841	}
				2842
/*
 * !CONFIG_CFS_BANDWIDTH stubs: runtime accounting, throttle checks and
 * slack return are all no-ops.
 */
				2843	static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
				2844	unsigned long delta_exec) {}
Paul Turner	d3d9dc3	2011-07-21 09:43:39 -0700	[diff] [blame]	2845	static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
				2846	static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
Peter Zijlstra	6c16a6d	2012-03-21 13:07:16 -0700	[diff] [blame]	2847	static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2848
/* without bandwidth control a cfs_rq is never reported as throttled */
				2849	static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
				2850	{
				2851	return 0;
				2852	}
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	2853
/* ... and neither is any hierarchy it belongs to */
				2854	static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
				2855	{
				2856	return 0;
				2857	}
				2858
/* always 0 in this configuration, whatever @src_cpu/@dest_cpu are */
				2859	static inline int throttled_lb_pair(struct task_group *tg,
				2860	int src_cpu, int dest_cpu)
				2861	{
				2862	return 0;
				2863	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2864
/* nothing to initialise without bandwidth control */
				2865	void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
				2866
				2867	#ifdef CONFIG_FAIR_GROUP_SCHED
				2868	static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
Paul Turner	ab84d31	2011-07-21 09:43:28 -0700	[diff] [blame]	2869	#endif
				2870
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2871	static inline struct cfs_bandwidth tg_cfs_bandwidth(struct task_group tg)
				2872	{
				2873	return NULL;
				2874	}
				2875	static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	2876	static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	2877
				2878	#endif /* CONFIG_CFS_BANDWIDTH */
				2879
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2880	/**************************************************
				2881	* CFS operations on tasks:
				2882	*/
				2883
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2884	#ifdef CONFIG_SCHED_HRTICK
				2885	static void hrtick_start_fair(struct rq rq, struct task_struct p)
				2886	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2887	struct sched_entity *se = &p->se;
				2888	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				2889
				2890	WARN_ON(task_rq(p) != rq);
				2891
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2892	if (cfs_rq->nr_running > 1) {
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2893	u64 slice = sched_slice(cfs_rq, se);
				2894	u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
				2895	s64 delta = slice - ran;
				2896
				2897	if (delta < 0) {
				2898	if (rq->curr == p)
				2899	resched_task(p);
				2900	return;
				2901	}
				2902
				2903	/*
				2904	* Don't schedule slices shorter than 10000ns, that just
				2905	* doesn't make sense. Rely on vruntime for fairness.
				2906	*/
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2907	if (rq->curr != p)
Peter Zijlstra	157124c	2008-07-28 11:53:11 +0200	[diff] [blame]	2908	delta = max_t(s64, 10000LL, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2909
Peter Zijlstra	3165651	2008-07-18 18:01:23 +0200	[diff] [blame]	2910	hrtick_start(rq, delta);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2911	}
				2912	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2913
				2914	/*
				2915	* called from enqueue/dequeue and updates the hrtick when the
				2916	* current task is from our class and nr_running is low enough
				2917	* to matter.
				2918	*/
				2919	static void hrtick_update(struct rq *rq)
				2920	{
				2921	struct task_struct *curr = rq->curr;
				2922
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	2923	if (!hrtick_enabled(rq) \|\| curr->sched_class != &fair_sched_class)
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2924	return;
				2925
				2926	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
				2927	hrtick_start_fair(rq, curr);
				2928	}
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	2929	#else /* !CONFIG_SCHED_HRTICK */
/* !CONFIG_SCHED_HRTICK: no high-resolution tick, both hooks are no-ops. */
static inline void
hrtick_start_fair(struct rq *rq, struct task_struct *p)
{
}

static inline void hrtick_update(struct rq *rq)
{
}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2938	#endif
				2939
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2940	/*
				2941	* The enqueue_task method is called before nr_running is
				2942	* increased. Here we update the fair scheduling stats and
				2943	* then put the task into the rbtree:
				2944	*/
Thomas Gleixner	ea87bb7	2010-01-20 20:58:57 +0000	[diff] [blame]	2945	static void
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2946	enqueue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2947	{
				2948	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2949	struct sched_entity *se = &p->se;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2950
				2951	for_each_sched_entity(se) {
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2952	if (se->on_rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2953	break;
				2954	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2955	enqueue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2956
				2957	/*
				2958	* end evaluation on encountering a throttled cfs_rq
				2959	*
				2960	* note: in the case of encountering a throttled cfs_rq we will
				2961	* post the final h_nr_running increment below.
				2962	*/
				2963	if (cfs_rq_throttled(cfs_rq))
				2964	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2965	cfs_rq->h_nr_running++;
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2966
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	2967	flags = ENQUEUE_WAKEUP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2968	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	2969
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2970	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	2971	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	2972	cfs_rq->h_nr_running++;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2973
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2974	if (cfs_rq_throttled(cfs_rq))
				2975	break;
				2976
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	2977	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	2978	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	2979	}
				2980
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2981	if (!se) {
				2982	update_rq_runnable_avg(rq, rq->nr_running);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	2983	inc_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	2984	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	2985	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2986	}
				2987
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2988	static void set_next_buddy(struct sched_entity *se);
				2989
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2990	/*
				2991	* The dequeue_task method is called before nr_running is
				2992	* decreased. We remove the task from the rbtree and
				2993	* update the fair scheduling stats:
				2994	*/
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	2995	static void dequeue_task_fair(struct rq rq, struct task_struct p, int flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	2996	{
				2997	struct cfs_rq *cfs_rq;
Peter Zijlstra	62fb185	2008-02-25 17:34:02 +0100	[diff] [blame]	2998	struct sched_entity *se = &p->se;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	2999	int task_sleep = flags & DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3000
				3001	for_each_sched_entity(se) {
				3002	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3003	dequeue_entity(cfs_rq, se, flags);
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3004
				3005	/*
				3006	* end evaluation on encountering a throttled cfs_rq
				3007	*
				3008	* note: in the case of encountering a throttled cfs_rq we will
				3009	* post the final h_nr_running decrement below.
				3010	*/
				3011	if (cfs_rq_throttled(cfs_rq))
				3012	break;
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	3013	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3014
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3015	/* Don't dequeue parent if it has other entities besides us */
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3016	if (cfs_rq->load.weight) {
				3017	/*
				3018	* Bias pick_next to pick a task from this cfs_rq, as
				3019	* p is sleeping when it is within its sched_slice.
				3020	*/
				3021	if (task_sleep && parent_entity(se))
				3022	set_next_buddy(parent_entity(se));
Paul Turner	9598c82	2011-07-06 22:30:37 -0700	[diff] [blame]	3023
				3024	/* avoid re-evaluating load for this entity */
				3025	se = parent_entity(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3026	break;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3027	}
Peter Zijlstra	371fd7e	2010-03-24 16:38:48 +0100	[diff] [blame]	3028	flags \|= DEQUEUE_SLEEP;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3029	}
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3030
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3031	for_each_sched_entity(se) {
Lin Ming	0f31714	2011-07-22 09:14:31 +0800	[diff] [blame]	3032	cfs_rq = cfs_rq_of(se);
Paul Turner	953bfcd	2011-07-21 09:43:27 -0700	[diff] [blame]	3033	cfs_rq->h_nr_running--;
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3034
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3035	if (cfs_rq_throttled(cfs_rq))
				3036	break;
				3037
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	3038	update_cfs_shares(cfs_rq);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	3039	update_entity_load_avg(se, 1);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3040	}
				3041
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	3042	if (!se) {
Paul Turner	85dac90	2011-07-21 09:43:33 -0700	[diff] [blame]	3043	dec_nr_running(rq);
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	3044	update_rq_runnable_avg(rq, 1);
				3045	}
Peter Zijlstra	a4c2f00	2008-10-17 19:27:03 +0200	[diff] [blame]	3046	hrtick_update(rq);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3047	}
				3048
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3049	#ifdef CONFIG_SMP
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3050	/* Used instead of source_load when we know the type == 0 */
				3051	static unsigned long weighted_cpuload(const int cpu)
				3052	{
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3053	return cpu_rq(cpu)->cfs.runnable_load_avg;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3054	}
				3055
				3056	/*
				3057	* Return a low guess at the load of a migration-source cpu weighted
				3058	* according to the scheduling class and "nice" value.
				3059	*
				3060	* We want to under-estimate the load of migration sources, to
				3061	* balance conservatively.
				3062	*/
				3063	static unsigned long source_load(int cpu, int type)
				3064	{
				3065	struct rq *rq = cpu_rq(cpu);
				3066	unsigned long total = weighted_cpuload(cpu);
				3067
				3068	if (type == 0 \|\| !sched_feat(LB_BIAS))
				3069	return total;
				3070
				3071	return min(rq->cpu_load[type-1], total);
				3072	}
				3073
				3074	/*
				3075	* Return a high guess at the load of a migration-target cpu weighted
				3076	* according to the scheduling class and "nice" value.
				3077	*/
				3078	static unsigned long target_load(int cpu, int type)
				3079	{
				3080	struct rq *rq = cpu_rq(cpu);
				3081	unsigned long total = weighted_cpuload(cpu);
				3082
				3083	if (type == 0 \|\| !sched_feat(LB_BIAS))
				3084	return total;
				3085
				3086	return max(rq->cpu_load[type-1], total);
				3087	}
				3088
				3089	static unsigned long power_of(int cpu)
				3090	{
				3091	return cpu_rq(cpu)->cpu_power;
				3092	}
				3093
				3094	static unsigned long cpu_avg_load_per_task(int cpu)
				3095	{
				3096	struct rq *rq = cpu_rq(cpu);
				3097	unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3098	unsigned long load_avg = rq->cfs.runnable_load_avg;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3099
				3100	if (nr_running)
Alex Shi	b92486c	2013-06-20 10:18:50 +0800	[diff] [blame]	3101	return load_avg / nr_running;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	3102
				3103	return 0;
				3104	}
				3105
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3106	static void record_wakee(struct task_struct *p)
				3107	{
				3108	/*
				3109	* Rough decay (wiping) for cost saving, don't worry
				3110	* about the boundary, really active task won't care
				3111	* about the loss.
				3112	*/
				3113	if (jiffies > current->wakee_flip_decay_ts + HZ) {
				3114	current->wakee_flips = 0;
				3115	current->wakee_flip_decay_ts = jiffies;
				3116	}
				3117
				3118	if (current->last_wakee != p) {
				3119	current->last_wakee = p;
				3120	current->wakee_flips++;
				3121	}
				3122	}
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3123
/*
 * task_waking_fair() - make @p's vruntime relative before it is woken.
 *
 * Subtracts the min_vruntime of @p's current cfs_rq from se->vruntime and
 * then records @p as a wakee of the current task.
 */
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	3124	static void task_waking_fair(struct task_struct *p)
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3125	{
				3126	struct sched_entity *se = &p->se;
				3127	struct cfs_rq *cfs_rq = cfs_rq_of(se);
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3128	u64 min_vruntime;
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3129
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3130	#ifndef CONFIG_64BIT
				3131	u64 min_vruntime_copy;
Peter Zijlstra	74f8e4b	2011-04-05 17:23:47 +0200	[diff] [blame]	3132
/*
 * Retry until min_vruntime and its shadow copy agree.
 * NOTE(review): presumably protects against torn 64-bit reads on 32-bit;
 * the writer side is not visible here - confirm.
 */
Peter Zijlstra	3fe1698	2011-04-05 17:23:48 +0200	[diff] [blame]	3133	do {
				3134	min_vruntime_copy = cfs_rq->min_vruntime_copy;
				3135	smp_rmb();
				3136	min_vruntime = cfs_rq->min_vruntime;
				3137	} while (min_vruntime != min_vruntime_copy);
				3138	#else
				3139	min_vruntime = cfs_rq->min_vruntime;
				3140	#endif
				3141
				3142	se->vruntime -= min_vruntime;
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3143	record_wakee(p);
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	3144	}
				3145
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3146	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3147	/*
				3148	* effective_load() calculates the load change as seen from the root_task_group
				3149	*
				3150	* Adding load to a group doesn't make a group heavier, but can cause movement
				3151	* of group shares between cpus. Assuming the shares were perfectly aligned one
				3152	* can calculate the shift in shares.
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3153	*
				3154	* Calculate the effective load difference if @wl is added (subtracted) to @tg
				3155	* on this @cpu and results in a total addition (subtraction) of @wg to the
				3156	* total group weight.
				3157	*
				3158	* Given a runqueue weight distribution (rw_i) we can compute a shares
				3159	* distribution (s_i) using:
				3160	*
				3161	* s_i = rw_i / \Sum rw_j (1)
				3162	*
				3163	* Suppose we have 4 CPUs and our @tg is a direct child of the root group and
				3164	* has 7 equal weight tasks, distributed as below (rw_i), with the resulting
				3165	* shares distribution (s_i):
				3166	*
				3167	* rw_i = { 2, 4, 1, 0 }
				3168	* s_i = { 2/7, 4/7, 1/7, 0 }
				3169	*
				3170	* As per wake_affine() we're interested in the load of two CPUs (the CPU the
				3171	* task used to run on and the CPU the waker is running on), we need to
				3172	* compute the effect of waking a task on either CPU and, in case of a sync
				3173	* wakeup, compute the effect of the current task going to sleep.
				3174	*
				3175	* So for a change of @wl to the local @cpu with an overall group weight change
				3176	* of @wl we can compute the new shares distribution (s'_i) using:
				3177	*
				3178	* s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
				3179	*
				3180	* Suppose we're interested in CPUs 0 and 1, and want to compute the load
				3181	* differences in waking a task to CPU 0. The additional task changes the
				3182	* weight and shares distributions like:
				3183	*
				3184	* rw'_i = { 3, 4, 1, 0 }
				3185	* s'_i = { 3/8, 4/8, 1/8, 0 }
				3186	*
				3187	* We can then compute the difference in effective weight by using:
				3188	*
				3189	* dw_i = S * (s'_i - s_i) (3)
				3190	*
				3191	* Where 'S' is the group weight as seen by its parent.
				3192	*
				3193	* Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
				3194	* times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
				3195	* 4/7) times the weight of the group.
Peter Zijlstra	f5bfb7d	2008-06-27 13:41:39 +0200	[diff] [blame]	3196	*/
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	3197	static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3198	{
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3199	struct sched_entity *se = tg->se[cpu];
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3200
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3201	if (!tg->parent) /* the trivial, non-cgroup case */
Peter Zijlstra	f1d239f	2008-06-27 13:41:38 +0200	[diff] [blame]	3202	return wl;
				3203
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3204	for_each_sched_entity(se) {
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3205	long w, W;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3206
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3207	tg = se->my_q->tg;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3208
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3209	/*
				3210	* W = @wg + \Sum rw_j
				3211	*/
				3212	W = wg + calc_tg_weight(tg, se->my_q);
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3213
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3214	/*
				3215	* w = rw_i + @wl
				3216	*/
				3217	w = se->my_q->load.weight + wl;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3218
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3219	/*
				3220	* wl = S * s'_i; see (2)
				3221	*/
				3222	if (W > 0 && w < W)
				3223	wl = (w * tg->shares) / W;
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3224	else
				3225	wl = tg->shares;
Peter Zijlstra	940959e	2008-09-23 15:33:42 +0200	[diff] [blame]	3226
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3227	/*
				3228	* Per the above, wl is the new se->load.weight value; since
				3229	* those are clipped to [MIN_SHARES, ...) do so now. See
				3230	* calc_cfs_shares().
				3231	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3232	if (wl < MIN_SHARES)
				3233	wl = MIN_SHARES;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3234
				3235	/*
				3236	* wl = dw_i = S * (s'_i - s_i); see (3)
				3237	*/
Paul Turner	977dda7	2011-01-14 17:57:50 -0800	[diff] [blame]	3238	wl -= se->load.weight;
Peter Zijlstra	cf5f0ac	2011-10-13 16:52:28 +0200	[diff] [blame]	3239
				3240	/*
				3241	* Recursively apply this logic to all parent groups to compute
				3242	* the final effective load change on the root group. Since
				3243	* only the @tg group gets extra weight, all parent groups can
				3244	* only redistribute existing shares. @wl is the shift in shares
				3245	* resulting from this level per the above.
				3246	*/
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3247	wg = 0;
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3248	}
				3249
				3250	return wl;
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3251	}
				3252	#else
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3253
/*
 * Without group scheduling there is no share redistribution: the load change
 * seen from the root is simply @wl.
 *
 * The types are signed to match the CONFIG_FAIR_GROUP_SCHED variant: callers
 * pass negative deltas (-weight) and add the result to a signed 64-bit
 * accumulator, which would not sign-extend from an unsigned return value
 * where long is narrower than 64 bits.
 */
static inline long effective_load(struct task_group *tg, int cpu,
		long wl, long wg)
{
	return wl;
}
Peter Zijlstra	4be9daa	2008-06-27 13:41:30 +0200	[diff] [blame]	3259
Peter Zijlstra	bb3469a	2008-06-27 13:41:27 +0200	[diff] [blame]	3260	#endif
				3261
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3262	static int wake_wide(struct task_struct *p)
				3263	{
Peter Zijlstra	7d9ffa8	2013-07-04 12:56:46 +0800	[diff] [blame]	3264	int factor = this_cpu_read(sd_llc_size);
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3265
				3266	/*
				3267	* Yeah, it's the switching-frequency, could means many wakee or
				3268	* rapidly switch, use factor here will just help to automatically
				3269	* adjust the loose-degree, so bigger node will lead to more pull.
				3270	*/
				3271	if (p->wakee_flips > factor) {
				3272	/*
				3273	* wakee is somewhat hot, it needs certain amount of cpu
				3274	* resource, so if waker is far more hot, prefer to leave
				3275	* it alone.
				3276	*/
				3277	if (current->wakee_flips > (factor * p->wakee_flips))
				3278	return 1;
				3279	}
				3280
				3281	return 0;
				3282	}
				3283
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3284	static int wake_affine(struct sched_domain sd, struct task_struct p, int sync)
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3285	{
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3286	s64 this_load, load;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3287	int idx, this_cpu, prev_cpu;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3288	unsigned long tl_per_task;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3289	struct task_group *tg;
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3290	unsigned long weight;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3291	int balanced;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3292
Michael Wang	6247041	2013-07-04 12:55:51 +0800	[diff] [blame]	3293	/*
				3294	* If we wake multiple tasks be careful to not bounce
				3295	* ourselves around too much.
				3296	*/
				3297	if (wake_wide(p))
				3298	return 0;
				3299
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3300	idx = sd->wake_idx;
				3301	this_cpu = smp_processor_id();
				3302	prev_cpu = task_cpu(p);
				3303	load = source_load(prev_cpu, idx);
				3304	this_load = target_load(this_cpu, idx);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3305
				3306	/*
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3307	* If sync wakeup then subtract the (maximum possible)
				3308	* effect of the currently running task from the load
				3309	* of the current CPU:
				3310	*/
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3311	if (sync) {
				3312	tg = task_group(current);
				3313	weight = current->se.load.weight;
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3314
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3315	this_load += effective_load(tg, this_cpu, -weight, -weight);
Peter Zijlstra	8337826	2008-06-27 13:41:37 +0200	[diff] [blame]	3316	load += effective_load(tg, prev_cpu, 0, -weight);
				3317	}
				3318
				3319	tg = task_group(p);
				3320	weight = p->se.load.weight;
				3321
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3322	/*
				3323	* In low-load situations, where prev_cpu is idle and this_cpu is idle
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3324	* due to the sync cause above having dropped this_load to 0, we'll
				3325	* always have an imbalance, but there's really nothing you can do
				3326	* about that, so that's good too.
Peter Zijlstra	71a29aa	2009-09-07 18:28:05 +0200	[diff] [blame]	3327	*
				3328	* Otherwise check if either cpus are near enough in load to allow this
				3329	* task to be woken on this_cpu.
				3330	*/
Paul Turner	e37b6a7	2011-01-21 20:44:59 -0800	[diff] [blame]	3331	if (this_load > 0) {
				3332	s64 this_eff_load, prev_eff_load;
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	3333
				3334	this_eff_load = 100;
				3335	this_eff_load *= power_of(prev_cpu);
				3336	this_eff_load *= this_load +
				3337	effective_load(tg, this_cpu, weight, weight);
				3338
				3339	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
				3340	prev_eff_load *= power_of(this_cpu);
				3341	prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
				3342
				3343	balanced = this_eff_load <= prev_eff_load;
				3344	} else
				3345	balanced = true;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3346
				3347	/*
				3348	* If the currently running task will sleep within
				3349	* a reasonable amount of time then attract this newly
				3350	* woken task:
				3351	*/
Peter Zijlstra	2fb7635	2008-10-08 09:16:04 +0200	[diff] [blame]	3352	if (sync && balanced)
				3353	return 1;
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3354
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3355	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
Mike Galbraith	b3137bc	2008-05-29 11:11:41 +0200	[diff] [blame]	3356	tl_per_task = cpu_avg_load_per_task(this_cpu);
				3357
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3358	if (balanced \|\|
				3359	(this_load <= load &&
				3360	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3361	/*
				3362	* This domain has SD_WAKE_AFFINE and
				3363	* p is cache cold in this domain, and
				3364	* there is no bad imbalance.
				3365	*/
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3366	schedstat_inc(sd, ttwu_move_affine);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	3367	schedstat_inc(p, se.statistics.nr_wakeups_affine);
Ingo Molnar	098fb9d	2008-03-16 20:36:10 +0100	[diff] [blame]	3368
				3369	return 1;
				3370	}
				3371	return 0;
				3372	}
				3373
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3374	/*
				3375	* find_idlest_group finds and returns the least busy CPU group within the
				3376	* domain.
				3377	*/
				3378	static struct sched_group *
Peter Zijlstra	78e7ed5	2009-09-03 13:16:51 +0200	[diff] [blame]	3379	find_idlest_group(struct sched_domain sd, struct task_struct p,
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3380	int this_cpu, int load_idx)
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3381	{
Andi Kleen	b3bd3de	2010-08-10 14:17:51 -0700	[diff] [blame]	3382	struct sched_group idlest = NULL, group = sd->groups;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3383	unsigned long min_load = ULONG_MAX, this_load = 0;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3384	int imbalance = 100 + (sd->imbalance_pct-100)/2;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3385
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3386	do {
				3387	unsigned long load, avg_load;
				3388	int local_group;
				3389	int i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3390
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3391	/* Skip over this group if it has no CPUs allowed */
				3392	if (!cpumask_intersects(sched_group_cpus(group),
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3393	tsk_cpus_allowed(p)))
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3394	continue;
				3395
				3396	local_group = cpumask_test_cpu(this_cpu,
				3397	sched_group_cpus(group));
				3398
				3399	/* Tally up the load of all CPUs in the group */
				3400	avg_load = 0;
				3401
				3402	for_each_cpu(i, sched_group_cpus(group)) {
				3403	/* Bias balancing toward cpus of our domain */
				3404	if (local_group)
				3405	load = source_load(i, load_idx);
				3406	else
				3407	load = target_load(i, load_idx);
				3408
				3409	avg_load += load;
				3410	}
				3411
				3412	/* Adjust by relative CPU power of the group */
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	3413	avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3414
				3415	if (local_group) {
				3416	this_load = avg_load;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3417	} else if (avg_load < min_load) {
				3418	min_load = avg_load;
				3419	idlest = group;
				3420	}
				3421	} while (group = group->next, group != sd->groups);
				3422
				3423	if (!idlest \|\| 100this_load < imbalancemin_load)
				3424	return NULL;
				3425	return idlest;
				3426	}
				3427
				3428	/*
				3429	* find_idlest_cpu - find the idlest cpu among the cpus in group.
				3430	*/
				3431	static int
				3432	find_idlest_cpu(struct sched_group group, struct task_struct p, int this_cpu)
				3433	{
				3434	unsigned long load, min_load = ULONG_MAX;
				3435	int idlest = -1;
				3436	int i;
				3437
				3438	/* Traverse only the allowed CPUs */
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3439	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3440	load = weighted_cpuload(i);
				3441
				3442	if (load < min_load \|\| (load == min_load && i == this_cpu)) {
				3443	min_load = load;
				3444	idlest = i;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3445	}
				3446	}
				3447
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3448	return idlest;
				3449	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3450
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3451	/*
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3452	* Try and locate an idle CPU in the sched_domain.
				3453	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3454	static int select_idle_sibling(struct task_struct *p, int target)
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3455	{
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3456	struct sched_domain *sd;
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3457	struct sched_group *sg;
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3458	int i = task_cpu(p);
				3459
				3460	if (idle_cpu(target))
				3461	return target;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3462
				3463	/*
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3464	* If the prevous cpu is cache affine and idle, don't be stupid.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3465	*/
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3466	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
				3467	return i;
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3468
				3469	/*
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3470	* Otherwise, iterate the domains and find an elegible idle cpu.
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3471	*/
Peter Zijlstra	518cd62	2011-12-07 15:07:31 +0100	[diff] [blame]	3472	sd = rcu_dereference(per_cpu(sd_llc, target));
Suresh Siddha	77e8136	2011-11-17 11:08:23 -0800	[diff] [blame]	3473	for_each_lower_domain(sd) {
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3474	sg = sd->groups;
				3475	do {
				3476	if (!cpumask_intersects(sched_group_cpus(sg),
				3477	tsk_cpus_allowed(p)))
				3478	goto next;
Mike Galbraith	970e178	2012-06-12 05:18:32 +0200	[diff] [blame]	3479
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3480	for_each_cpu(i, sched_group_cpus(sg)) {
Mike Galbraith	e0a79f5	2013-01-28 12:19:25 +0100	[diff] [blame]	3481	if (i == target \|\| !idle_cpu(i))
Linus Torvalds	37407ea	2012-09-16 12:29:43 -0700	[diff] [blame]	3482	goto next;
				3483	}
				3484
				3485	target = cpumask_first_and(sched_group_cpus(sg),
				3486	tsk_cpus_allowed(p));
				3487	goto done;
				3488	next:
				3489	sg = sg->next;
				3490	} while (sg != sd->groups);
				3491	}
				3492	done:
Peter Zijlstra	a50bde5	2009-11-12 15:55:28 +0100	[diff] [blame]	3493	return target;
				3494	}
				3495
				3496	/*
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3497	* sched_balance_self: balance the current task (running on cpu) in domains
				3498	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
				3499	* SD_BALANCE_EXEC.
				3500	*
				3501	* Balance, ie. select the least loaded group.
				3502	*
				3503	* Returns the target CPU number, or the same CPU if no balancing is needed.
				3504	*
				3505	* preempt must be disabled.
				3506	*/
Peter Zijlstra	0017d73	2010-03-24 18:34:10 +0100	[diff] [blame]	3507	static int
Peter Zijlstra	7608dec	2011-04-05 17:23:46 +0200	[diff] [blame]	3508	select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3509	{
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3510	struct sched_domain tmp, affine_sd = NULL, *sd = NULL;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3511	int cpu = smp_processor_id();
				3512	int prev_cpu = task_cpu(p);
				3513	int new_cpu = cpu;
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3514	int want_affine = 0;
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3515	int sync = wake_flags & WF_SYNC;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3516
Peter Zijlstra	29baa74	2012-04-23 12:11:21 +0200	[diff] [blame]	3517	if (p->nr_cpus_allowed == 1)
Mike Galbraith	76854c7	2011-11-22 15:18:24 +0100	[diff] [blame]	3518	return prev_cpu;
				3519
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3520	if (sd_flag & SD_BALANCE_WAKE) {
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	3521	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3522	want_affine = 1;
				3523	new_cpu = prev_cpu;
				3524	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3525
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3526	rcu_read_lock();
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3527	for_each_domain(cpu, tmp) {
Peter Zijlstra	e4f42888	2009-12-16 18:04:34 +0100	[diff] [blame]	3528	if (!(tmp->flags & SD_LOAD_BALANCE))
				3529	continue;
				3530
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3531	/*
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3532	* If both cpu and prev_cpu are part of this domain,
				3533	* cpu is a valid SD_WAKE_AFFINE target.
Peter Zijlstra	fe3bcfe	2009-11-12 15:55:29 +0100	[diff] [blame]	3534	*/
Suresh Siddha	99bd5e2	2010-03-31 16:47:45 -0700	[diff] [blame]	3535	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
				3536	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
				3537	affine_sd = tmp;
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3538	break;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3539	}
				3540
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3541	if (tmp->flags & sd_flag)
Peter Zijlstra	29cd8ba	2009-09-17 09:01:14 +0200	[diff] [blame]	3542	sd = tmp;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3543	}
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3544
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3545	if (affine_sd) {
Alex Shi	f03542a	2012-07-26 08:55:34 +0800	[diff] [blame]	3546	if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3547	prev_cpu = cpu;
				3548
				3549	new_cpu = select_idle_sibling(p, prev_cpu);
				3550	goto unlock;
Mike Galbraith	8b911ac	2010-03-11 17:17:16 +0100	[diff] [blame]	3551	}
Peter Zijlstra	3b64089	2009-09-16 13:44:33 +0200	[diff] [blame]	3552
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3553	while (sd) {
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3554	int load_idx = sd->forkexec_idx;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3555	struct sched_group *group;
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3556	int weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3557
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3558	if (!(sd->flags & sd_flag)) {
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3559	sd = sd->child;
				3560	continue;
				3561	}
				3562
Peter Zijlstra	5158f4e	2009-09-16 13:46:59 +0200	[diff] [blame]	3563	if (sd_flag & SD_BALANCE_WAKE)
				3564	load_idx = sd->wake_idx;
				3565
				3566	group = find_idlest_group(sd, p, cpu, load_idx);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3567	if (!group) {
				3568	sd = sd->child;
				3569	continue;
				3570	}
				3571
Peter Zijlstra	d7c33c4	2009-09-11 12:45:38 +0200	[diff] [blame]	3572	new_cpu = find_idlest_cpu(group, p, cpu);
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3573	if (new_cpu == -1 \|\| new_cpu == cpu) {
				3574	/* Now try balancing at a lower domain level of cpu */
				3575	sd = sd->child;
				3576	continue;
				3577	}
				3578
				3579	/* Now try balancing at a lower domain level of new_cpu */
				3580	cpu = new_cpu;
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3581	weight = sd->span_weight;
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3582	sd = NULL;
				3583	for_each_domain(cpu, tmp) {
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	3584	if (weight <= tmp->span_weight)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3585	break;
Peter Zijlstra	0763a66	2009-09-14 19:37:39 +0200	[diff] [blame]	3586	if (tmp->flags & sd_flag)
Peter Zijlstra	aaee120	2009-09-10 13:36:25 +0200	[diff] [blame]	3587	sd = tmp;
				3588	}
				3589	/* while loop will break here if sd == NULL */
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3590	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	3591	unlock:
				3592	rcu_read_unlock();
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3593
Peter Zijlstra	c88d591	2009-09-10 13:50:02 +0200	[diff] [blame]	3594	return new_cpu;
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3595	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3596
				3597	/*
				3598	* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
				3599	* cfs_rq_of(p) references at time of call are still valid and identify the
				3600	* previous cpu. However, the caller only guarantees p->pi_lock is held; no
				3601	* other assumptions, including the state of rq->lock, should be made.
				3602	*/
				3603	static void
				3604	migrate_task_rq_fair(struct task_struct *p, int next_cpu)
				3605	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	3606	struct sched_entity *se = &p->se;
				3607	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				3608
				3609	/*
				3610	* Load tracking: accumulate removed load so that it can be processed
				3611	* when we next update owning cfs_rq under rq->lock. Tasks contribute
				3612	* to blocked load iff they have a positive decay-count. It can never
				3613	* be negative here since on-rq tasks have decay-count == 0.
				3614	*/
				3615	if (se->avg.decay_count) {
				3616	se->avg.decay_count = -__synchronize_entity_decay(se);
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	3617	atomic_long_add(se->avg.load_avg_contrib,
				3618	&cfs_rq->removed_load);
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	3619	}
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	3620	}
Gregory Haskins	e7693a3	2008-01-25 21:08:09 +0100	[diff] [blame]	3621	#endif /* CONFIG_SMP */
				3622
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3623	static unsigned long
				3624	wakeup_gran(struct sched_entity curr, struct sched_entity se)
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3625	{
				3626	unsigned long gran = sysctl_sched_wakeup_granularity;
				3627
				3628	/*
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3629	* Since its curr running now, convert the gran from real-time
				3630	* to virtual-time in his units.
Mike Galbraith	13814d4	2010-03-11 17:17:04 +0100	[diff] [blame]	3631	*
				3632	* By using 'se' instead of 'curr' we penalize light tasks, so
				3633	* they get preempted easier. That is, if 'se' < 'curr' then
				3634	* the resulting gran will be larger, therefore penalizing the
				3635	* lighter, if otoh 'se' > 'curr' then the resulting gran will
				3636	* be smaller, again penalizing the lighter task.
				3637	*
				3638	* This is especially important for buddies when the leftmost
				3639	* task is higher priority than the buddy.
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3640	*/
Shaohua Li	f4ad9bd	2011-04-08 12:53:09 +0800	[diff] [blame]	3641	return calc_delta_fair(gran, se);
Peter Zijlstra	0bbd333	2008-04-19 19:44:57 +0200	[diff] [blame]	3642	}
				3643
				3644	/*
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3645	* Should 'se' preempt 'curr'.
				3646	*
				3647	* \|s1
				3648	* \|s2
				3649	* \|s3
				3650	* g
				3651	* \|<--->\|c
				3652	*
				3653	* w(c, s1) = -1
				3654	* w(c, s2) = 0
				3655	* w(c, s3) = 1
				3656	*
				3657	*/
				3658	static int
				3659	wakeup_preempt_entity(struct sched_entity curr, struct sched_entity se)
				3660	{
				3661	s64 gran, vdiff = curr->vruntime - se->vruntime;
				3662
				3663	if (vdiff <= 0)
				3664	return -1;
				3665
Peter Zijlstra	e52fb7c	2009-01-14 12:39:19 +0100	[diff] [blame]	3666	gran = wakeup_gran(curr, se);
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3667	if (vdiff > gran)
				3668	return 1;
				3669
				3670	return 0;
				3671	}
				3672
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3673	static void set_last_buddy(struct sched_entity *se)
				3674	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3675	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3676	return;
				3677
				3678	for_each_sched_entity(se)
				3679	cfs_rq_of(se)->last = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3680	}
				3681
				3682	static void set_next_buddy(struct sched_entity *se)
				3683	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3684	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
				3685	return;
				3686
				3687	for_each_sched_entity(se)
				3688	cfs_rq_of(se)->next = se;
Peter Zijlstra	0247909	2008-11-04 21:25:10 +0100	[diff] [blame]	3689	}
				3690
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3691	static void set_skip_buddy(struct sched_entity *se)
				3692	{
Venkatesh Pallipadi	69c80f3	2011-04-13 18:21:09 -0700	[diff] [blame]	3693	for_each_sched_entity(se)
				3694	cfs_rq_of(se)->skip = se;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3695	}
				3696
Peter Zijlstra	464b752	2008-10-24 11:06:15 +0200	[diff] [blame]	3697	/*
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3698	* Preempt the current task with a newly woken task if needed:
				3699	*/
Peter Zijlstra	5a9b86f	2009-09-16 13:47:58 +0200	[diff] [blame]	3700	static void check_preempt_wakeup(struct rq rq, struct task_struct p, int wake_flags)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3701	{
				3702	struct task_struct *curr = rq->curr;
Srivatsa Vaddagiri	8651a86	2007-10-15 17:00:12 +0200	[diff] [blame]	3703	struct sched_entity se = &curr->se, pse = &p->se;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3704	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
Mike Galbraith	f685cea	2009-10-23 23:09:22 +0200	[diff] [blame]	3705	int scale = cfs_rq->nr_running >= sched_nr_latency;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3706	int next_buddy_marked = 0;
Mike Galbraith	03e89e4	2008-12-16 08:45:30 +0100	[diff] [blame]	3707
Ingo Molnar	4ae7d5c	2008-03-19 01:42:00 +0100	[diff] [blame]	3708	if (unlikely(se == pse))
				3709	return;
				3710
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3711	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3712	* This is possible from callers such as move_task(), in which we
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3713	* unconditionally check_prempt_curr() after an enqueue (which may have
				3714	* lead to a throttle). This both saves work and prevents false
				3715	* next-buddy nomination below.
				3716	*/
				3717	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
				3718	return;
				3719
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3720	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
Mike Galbraith	3cb63d5	2009-09-11 12:01:17 +0200	[diff] [blame]	3721	set_next_buddy(pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3722	next_buddy_marked = 1;
				3723	}
Peter Zijlstra	57fdc26	2008-09-23 15:33:45 +0200	[diff] [blame]	3724
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3725	/*
				3726	* We can come here with TIF_NEED_RESCHED already set from new task
				3727	* wake up path.
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3728	*
				3729	* Note: this also catches the edge-case of curr being in a throttled
				3730	* group (e.g. via set_curr_task), since update_curr() (in the
				3731	* enqueue of curr) will have resulted in resched being set. This
				3732	* prevents us from potentially nominating it as a false LAST_BUDDY
				3733	* below.
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	3734	*/
				3735	if (test_tsk_need_resched(curr))
				3736	return;
				3737
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3738	/* Idle tasks are by definition preempted by non-idle tasks. */
				3739	if (unlikely(curr->policy == SCHED_IDLE) &&
				3740	likely(p->policy != SCHED_IDLE))
				3741	goto preempt;
				3742
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3743	/*
Darren Hart	a2f5c9a	2011-02-22 13:04:33 -0800	[diff] [blame]	3744	* Batch and idle tasks do not preempt non-idle tasks (their preemption
				3745	* is driven by the tick):
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3746	*/
Ingo Molnar	8ed92e51	2012-10-14 14:28:50 +0200	[diff] [blame]	3747	if (unlikely(p->policy != SCHED_NORMAL) \|\| !sched_feat(WAKEUP_PREEMPTION))
Ingo Molnar	91c234b	2007-10-15 17:00:18 +0200	[diff] [blame]	3748	return;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3749
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3750	find_matching_se(&se, &pse);
Paul Turner	9bbd737	2011-07-05 19:07:21 -0700	[diff] [blame]	3751	update_curr(cfs_rq_of(se));
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3752	BUG_ON(!pse);
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3753	if (wakeup_preempt_entity(se, pse) == 1) {
				3754	/*
				3755	* Bias pick_next to pick the sched entity that is
				3756	* triggering this preemption.
				3757	*/
				3758	if (!next_buddy_marked)
				3759	set_next_buddy(pse);
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3760	goto preempt;
Venkatesh Pallipadi	2f36825	2011-04-14 10:30:53 -0700	[diff] [blame]	3761	}
Jupyung Lee	a65ac74	2009-11-17 18:51:40 +0900	[diff] [blame]	3762
Peter Zijlstra	3a7e73a	2009-11-28 18:51:02 +0100	[diff] [blame]	3763	return;
				3764
				3765	preempt:
				3766	resched_task(curr);
				3767	/*
				3768	* Only set the backward buddy when the current task is still
				3769	* on the rq. This can happen when a wakeup gets interleaved
				3770	* with schedule on the ->pre_schedule() or idle_balance()
				3771	* point, either of which can drop the rq lock.
				3772	*
				3773	* Also, during early boot the idle thread is in the fair class,
				3774	* for obvious reasons it's a bad idea to schedule back to it.
				3775	*/
				3776	if (unlikely(!se->on_rq || curr == rq->idle))
				3777	return;
				3778
				3779	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
				3780	set_last_buddy(se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3781	}
				3782
Ingo Molnar	fb8d472	2007-08-09 11:16:48 +0200	[diff] [blame]	3783	static struct task_struct *pick_next_task_fair(struct rq *rq)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3784	{
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3785	struct task_struct *p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3786	struct cfs_rq *cfs_rq = &rq->cfs;
				3787	struct sched_entity *se;
				3788
Tim Blechmann	36ace27	2009-11-24 11:55:45 +0100	[diff] [blame]	3789	if (!cfs_rq->nr_running)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3790	return NULL;
				3791
				3792	do {
Ingo Molnar	9948f4b	2007-08-09 11:16:48 +0200	[diff] [blame]	3793	se = pick_next_entity(cfs_rq);
Peter Zijlstra	f4b6755	2008-11-04 21:25:07 +0100	[diff] [blame]	3794	set_next_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3795	cfs_rq = group_cfs_rq(se);
				3796	} while (cfs_rq);
				3797
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3798	p = task_of(se);
Mike Galbraith	b39e66e	2011-11-22 15:20:07 +0100	[diff] [blame]	3799	if (hrtick_enabled(rq))
				3800	hrtick_start_fair(rq, p);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	3801
				3802	return p;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3803	}
				3804
				3805	/*
				3806	* Account for a descheduled task:
				3807	*/
Ingo Molnar	31ee529	2007-08-09 11:16:49 +0200	[diff] [blame]	3808	static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3809	{
				3810	struct sched_entity *se = &prev->se;
				3811	struct cfs_rq *cfs_rq;
				3812
				3813	for_each_sched_entity(se) {
				3814	cfs_rq = cfs_rq_of(se);
Ingo Molnar	ab6cde2	2007-08-09 11:16:48 +0200	[diff] [blame]	3815	put_prev_entity(cfs_rq, se);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3816	}
				3817	}
				3818
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3819	/*
				3820	* sched_yield() is very simple
				3821	*
				3822	* The magic of dealing with the ->skip buddy is in pick_next_entity.
				3823	*/
				3824	static void yield_task_fair(struct rq *rq)
				3825	{
				3826	struct task_struct *curr = rq->curr;
				3827	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
				3828	struct sched_entity *se = &curr->se;
				3829
				3830	/*
				3831	* Are we the only task in the tree?
				3832	*/
				3833	if (unlikely(rq->nr_running == 1))
				3834	return;
				3835
				3836	clear_buddies(cfs_rq, se);
				3837
				3838	if (curr->policy != SCHED_BATCH) {
				3839	update_rq_clock(rq);
				3840	/*
				3841	* Update run-time statistics of the 'current'.
				3842	*/
				3843	update_curr(cfs_rq);
Mike Galbraith	916671c	2011-11-22 15:21:26 +0100	[diff] [blame]	3844	/*
				3845	* Tell update_rq_clock() that we've just updated,
				3846	* so we don't do microscopic update in schedule()
				3847	* and double the fastpath cost.
				3848	*/
				3849	rq->skip_clock_update = 1;
Rik van Riel	ac53db5	2011-02-01 09:51:03 -0500	[diff] [blame]	3850	}
				3851
				3852	set_skip_buddy(se);
				3853	}
				3854
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3855	static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
				3856	{
				3857	struct sched_entity *se = &p->se;
				3858
Paul Turner	5238cdd	2011-07-21 09:43:37 -0700	[diff] [blame]	3859	/* throttled hierarchies are not runnable */
				3860	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3861	return false;
				3862
				3863	/* Tell the scheduler that we'd really like se to run next. */
				3864	set_next_buddy(se);
				3865
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	3866	yield_task_fair(rq);
				3867
				3868	return true;
				3869	}
				3870
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	3871	#ifdef CONFIG_SMP
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3872	/**************************************************
Peter Zijlstra	e9c84cb	2012-07-03 13:53:26 +0200	[diff] [blame]	3873	* Fair scheduling class load-balancing methods.
				3874	*
				3875	* BASICS
				3876	*
				3877	* The purpose of load-balancing is to achieve the same basic fairness the
				3878	* per-cpu scheduler provides, namely provide a proportional amount of compute
				3879	* time to each task. This is expressed in the following equation:
				3880	*
				3881	* W_i,n/P_i == W_j,n/P_j for all i,j (1)
				3882	*
				3883	* Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
				3884	* W_i,0 is defined as:
				3885	*
				3886	* W_i,0 = \Sum_j w_i,j (2)
				3887	*
				3888	* Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
				3889	* is derived from the nice value as per prio_to_weight[].
				3890	*
				3891	* The weight average is an exponential decay average of the instantaneous
				3892	* weight:
				3893	*
				3894	* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
				3895	*
				3896	* P_i is the cpu power (or compute capacity) of cpu i, typically it is the
				3897	* fraction of 'recent' time available for SCHED_OTHER task execution. But it
				3898	* can also include other factors [XXX].
				3899	*
				3900	* To achieve this balance we define a measure of imbalance which follows
				3901	* directly from (1):
				3902	*
				3903	* imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
				3904	*
				3905	* We then move tasks around to minimize the imbalance. In the continuous
				3906	* function space it is obvious this converges, in the discrete case we get
				3907	* a few fun cases generally called infeasible weight scenarios.
				3908	*
				3909	* [XXX expand on:
				3910	* - infeasible weights;
				3911	* - local vs global optima in the discrete case. ]
				3912	*
				3913	*
				3914	* SCHED DOMAINS
				3915	*
				3916	* In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
				3917	* for all i,j solution, we create a tree of cpus that follows the hardware
				3918	* topology where each level pairs two lower groups (or better). This results
				3919	* in O(log n) layers. Furthermore we reduce the number of cpus going up the
				3920	* tree to only the first of the previous level and we decrease the frequency
				3921	* of load-balance at each level inv. proportional to the number of cpus in
				3922	* the groups.
				3923	*
				3924	* This yields:
				3925	*
				3926	*     log_2 n     1     n
				3927	*   \Sum       { --- * --- * 2^i } = O(n)                            (5)
				3928	*     i = 0      2^i   2^i
				3929	*                             `- size of each group
				3930	*     |           |     `- number of cpus doing load-balance
				3931	*     |           `- freq
				3932	*     `- sum over all levels
				3933	*
				3934	* Coupled with a limit on how many tasks we can migrate every balance pass,
				3935	* this makes (5) the runtime complexity of the balancer.
				3936	*
				3937	* An important property here is that each CPU is still (indirectly) connected
				3938	* to every other cpu in at most O(log n) steps:
				3939	*
				3940	* The adjacency matrix of the resulting graph is given by:
				3941	*
				3942	*           log_2 n
				3943	*   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
				3944	*           k = 0
				3945	*
				3946	* And you'll find that:
				3947	*
				3948	* A^(log_2 n)_i,j != 0 for all i,j (7)
				3949	*
				3950	* Showing there's indeed a path between every cpu in at most O(log n) steps.
				3951	* The task movement gives a factor of O(m), giving a convergence complexity
				3952	* of:
				3953	*
				3954	* O(nm log n), n := nr_cpus, m := nr_tasks (8)
				3955	*
				3956	*
				3957	* WORK CONSERVING
				3958	*
				3959	* In order to avoid CPUs going idle while there's still work to do, new idle
				3960	* balancing is more aggressive and has the newly idle cpu iterate up the domain
				3961	* tree itself instead of relying on other CPUs to bring it work.
				3962	*
				3963	* This adds some complexity to both (5) and (8) but it reduces the total idle
				3964	* time.
				3965	*
				3966	* [XXX more?]
				3967	*
				3968	*
				3969	* CGROUPS
				3970	*
				3971	* Cgroups make a horror show out of (2), instead of a simple sum we get:
				3972	*
				3973	*                                s_k,i
				3974	*   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
				3975	*                                 S_k
				3976	*
				3977	* Where
				3978	*
				3979	* s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
				3980	*
				3981	* w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
				3982	*
				3983	* The big problem is S_k, it's a global sum needed to compute a local (W_i)
				3984	* property.
				3985	*
				3986	* [XXX write more on how we solve this.. _after_ merging pjt's patches that
				3987	* rewrite all of this once again.]
				3988	*/
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	3989
Hiroshi Shimamoto	ed387b7	2012-01-31 11:40:32 +0900	[diff] [blame]	3990	static unsigned long __read_mostly max_load_balance_interval = HZ/10;
				3991
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3992	#define LBF_ALL_PINNED 0x01
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	3993	#define LBF_NEED_BREAK 0x02
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	3994	#define LBF_DST_PINNED 0x04
				3995	#define LBF_SOME_PINNED 0x08
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	3996
				3997	struct lb_env {
				3998	struct sched_domain *sd;
				3999
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4000	struct rq *src_rq;
Prashanth Nageshappa	85c1e7d	2012-06-19 17:47:34 +0530	[diff] [blame]	4001	int src_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4002
				4003	int dst_cpu;
				4004	struct rq *dst_rq;
				4005
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4006	struct cpumask *dst_grpmask;
				4007	int new_dst_cpu;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4008	enum cpu_idle_type idle;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4009	long imbalance;
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4010	/* The set of CPUs under consideration for load-balancing */
				4011	struct cpumask *cpus;
				4012
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4013	unsigned int flags;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4014
				4015	unsigned int loop;
				4016	unsigned int loop_break;
				4017	unsigned int loop_max;
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4018	};
				4019
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4020	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4021	* move_task - move a task from one runqueue to another runqueue.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4022	* Both runqueues must be locked.
				4023	*/
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4024	static void move_task(struct task_struct *p, struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4025	{
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4026	deactivate_task(env->src_rq, p, 0);
				4027	set_task_cpu(p, env->dst_cpu);
				4028	activate_task(env->dst_rq, p, 0);
				4029	check_preempt_curr(env->dst_rq, p, 0);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4030	}
				4031
				4032	/*
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4033	* Is this task likely cache-hot:
				4034	*/
				4035	static int
				4036	task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
				4037	{
				4038	s64 delta;
				4039
				4040	if (p->sched_class != &fair_sched_class)
				4041	return 0;
				4042
				4043	if (unlikely(p->policy == SCHED_IDLE))
				4044	return 0;
				4045
				4046	/*
				4047	* Buddy candidates are cache hot:
				4048	*/
				4049	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
				4050	(&p->se == cfs_rq_of(&p->se)->next \|\|
				4051	&p->se == cfs_rq_of(&p->se)->last))
				4052	return 1;
				4053
				4054	if (sysctl_sched_migration_cost == -1)
				4055	return 1;
				4056	if (sysctl_sched_migration_cost == 0)
				4057	return 0;
				4058
				4059	delta = now - p->se.exec_start;
				4060
				4061	return delta < (s64)sysctl_sched_migration_cost;
				4062	}
				4063
				4064	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4065	* can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
				4066	*/
				4067	static
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4068	int can_migrate_task(struct task_struct *p, struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4069	{
				4070	int tsk_cache_hot = 0;
				4071	/*
				4072	* We do not migrate tasks that are:
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4073	* 1) throttled_lb_pair, or
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4074	* 2) cannot be migrated to this CPU due to cpus_allowed, or
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4075	* 3) running (obviously), or
				4076	* 4) are cache-hot on their current CPU.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4077	*/
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4078	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
				4079	return 0;
				4080
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4081	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4082	int cpu;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4083
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4084	schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4085
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4086	env->flags |= LBF_SOME_PINNED;
				4087
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4088	/*
				4089	* Remember if this task can be migrated to any other cpu in
				4090	* our sched_group. We may want to revisit it if we couldn't
				4091	* meet load balance goals by pulling other tasks on src_cpu.
				4092	*
				4093	* Also avoid computing new_dst_cpu if we have already computed
				4094	* one in current iteration.
				4095	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4096	if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4097	return 0;
				4098
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4099	/* Prevent re-selecting dst_cpu via env's cpus */
				4100	for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
				4101	if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4102	env->flags |= LBF_DST_PINNED;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4103	env->new_dst_cpu = cpu;
				4104	break;
				4105	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4106	}
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	4107
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4108	return 0;
				4109	}
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	4110
				4111	/* Record that we found at least one task that could run on dst_cpu */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4112	env->flags &= ~LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4113
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4114	if (task_running(env->src_rq, p)) {
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4115	schedstat_inc(p, se.statistics.nr_failed_migrations_running);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4116	return 0;
				4117	}
				4118
				4119	/*
				4120	* Aggressive migration if:
				4121	* 1) task is cache cold, or
				4122	* 2) too many balance attempts have failed.
				4123	*/
				4124
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4125	tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4126	if (!tsk_cache_hot ||
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4127	env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4128
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4129	if (tsk_cache_hot) {
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4130	schedstat_inc(env->sd, lb_hot_gained[env->idle]);
Lucas De Marchi	41acab8	2010-03-10 23:37:45 -0300	[diff] [blame]	4131	schedstat_inc(p, se.statistics.nr_forced_migrations);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4132	}
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4133
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4134	return 1;
				4135	}
				4136
Zhang Hang	4e2dcb7	2013-04-10 14:04:55 +0800	[diff] [blame]	4137	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
				4138	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4139	}
				4140
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4141	/*
				4142	* move_one_task tries to move exactly one task from busiest to this_rq, as
				4143	* part of active balancing operations within "domain".
				4144	* Returns 1 if successful and 0 otherwise.
				4145	*
				4146	* Called with both runqueues locked.
				4147	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4148	static int move_one_task(struct lb_env *env)
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4149	{
				4150	struct task_struct *p, *n;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4151
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4152	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4153	if (!can_migrate_task(p, env))
				4154	continue;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4155
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4156	move_task(p, env);
				4157	/*
				4158	* Right now, this is only the second place move_task()
				4159	* is called, so we can safely collect move_task()
				4160	* stats here rather than inside move_task().
				4161	*/
				4162	schedstat_inc(env->sd, lb_gained[env->idle]);
				4163	return 1;
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4164	}
Peter Zijlstra	897c395	2009-12-17 17:45:42 +0100	[diff] [blame]	4165	return 0;
				4166	}
				4167
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4168	static unsigned long task_h_load(struct task_struct *p);
				4169
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4170	static const unsigned int sched_nr_migrate_break = 32;
				4171
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4172	/*
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4173	* move_tasks tries to move up to imbalance weighted load from busiest to
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4174	* this_rq, as part of a balancing operation within domain "sd".
				4175	* Returns the number of tasks moved (0 if none were moved).
				4176	*
				4177	* Called with both runqueues locked.
				4178	*/
				4179	static int move_tasks(struct lb_env *env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4180	{
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4181	struct list_head *tasks = &env->src_rq->cfs_tasks;
				4182	struct task_struct *p;
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4183	unsigned long load;
				4184	int pulled = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4185
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4186	if (env->imbalance <= 0)
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4187	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4188
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4189	while (!list_empty(tasks)) {
				4190	p = list_first_entry(tasks, struct task_struct, se.group_node);
				4191
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4192	env->loop++;
				4193	/* We've more or less seen every task there is, call it quits */
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4194	if (env->loop > env->loop_max)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4195	break;
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4196
				4197	/* take a breather every nr_migrate tasks */
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4198	if (env->loop > env->loop_break) {
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4199	env->loop_break += sched_nr_migrate_break;
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4200	env->flags |= LBF_NEED_BREAK;
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4201	break;
Peter Zijlstra	a195f00	2011-09-22 15:30:18 +0200	[diff] [blame]	4202	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4203
Joonsoo Kim	d319808	2013-04-23 17:27:40 +0900	[diff] [blame]	4204	if (!can_migrate_task(p, env))
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4205	goto next;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4206
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4207	load = task_h_load(p);
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4208
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	4209	if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4210	goto next;
				4211
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4212	if ((load / 2) > env->imbalance)
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4213	goto next;
				4214
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4215	move_task(p, env);
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4216	pulled++;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4217	env->imbalance -= load;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4218
				4219	#ifdef CONFIG_PREEMPT
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4220	/*
				4221	* NEWIDLE balancing is a source of latency, so preemptible
				4222	* kernels will stop after the first task is pulled to minimize
				4223	* the critical section.
				4224	*/
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4225	if (env->idle == CPU_NEWLY_IDLE)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4226	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4227	#endif
				4228
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4229	/*
				4230	* We only want to steal up to the prescribed amount of
				4231	* weighted load.
				4232	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4233	if (env->imbalance <= 0)
Peter Zijlstra	ee00e66	2009-12-17 17:25:20 +0100	[diff] [blame]	4234	break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4235
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4236	continue;
				4237	next:
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4238	list_move_tail(&p->se.group_node, tasks);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4239	}
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4240
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4241	/*
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	4242	* Right now, this is one of only two places move_task() is called,
				4243	* so we can safely collect move_task() stats here rather than
				4244	* inside move_task().
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4245	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	4246	schedstat_add(env->sd, lb_gained[env->idle], pulled);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4247
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	4248	return pulled;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4249	}
				4250
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4251	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4252	/*
				4253	* update tg->load_weight by folding this cpu's load_avg
				4254	*/
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4255	static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4256	{
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4257	struct sched_entity *se = tg->se[cpu];
				4258	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4259
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4260	/* throttled entities do not contribute to load */
				4261	if (throttled_hierarchy(cfs_rq))
				4262	return;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4263
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	4264	update_cfs_rq_blocked_load(cfs_rq, 1);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4265
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4266	if (se) {
				4267	update_entity_load_avg(se, 1);
				4268	/*
				4269	* We pivot on our runnable average having decayed to zero for
				4270	* list removal. This generally implies that all our children
				4271	* have also been removed (modulo rounding error or bandwidth
				4272	* control); however, such cases are rare and we can fix these
				4273	* at enqueue.
				4274	*
				4275	* TODO: fix up out-of-order children on enqueue.
				4276	*/
				4277	if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
				4278	list_del_leaf_cfs_rq(cfs_rq);
				4279	} else {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4280	struct rq *rq = rq_of(cfs_rq);
Paul Turner	8295836	2012-10-04 13:18:31 +0200	[diff] [blame]	4281	update_rq_runnable_avg(rq, rq->nr_running);
				4282	}
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4283	}
				4284
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4285	static void update_blocked_averages(int cpu)
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4286	{
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4287	struct rq *rq = cpu_rq(cpu);
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4288	struct cfs_rq *cfs_rq;
				4289	unsigned long flags;
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4290
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4291	raw_spin_lock_irqsave(&rq->lock, flags);
				4292	update_rq_clock(rq);
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4293	/*
				4294	* Iterates the task_group tree in a bottom up fashion, see
				4295	* list_add_leaf_cfs_rq() for details.
				4296	*/
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4297	for_each_leaf_cfs_rq(rq, cfs_rq) {
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4298	/*
				4299	* Note: We may want to consider periodically releasing
				4300	* rq->lock about these updates so that creating many task
				4301	* groups does not result in continually extending hold time.
				4302	*/
				4303	__update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
Paul Turner	64660c8	2011-07-21 09:43:36 -0700	[diff] [blame]	4304	}
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	4305
				4306	raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra	9e3081c	2010-11-15 15:47:02 -0800	[diff] [blame]	4307	}
				4308
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4309	/*
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4310	* Compute the hierarchical load factor for cfs_rq and all its ascendants.
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4311	* This needs to be done in a top-down fashion because the load of a child
				4312	* group is a fraction of its parent's load.
				4313	*/
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4314	static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4315	{
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4316	struct rq *rq = rq_of(cfs_rq);
				4317	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4318	unsigned long now = jiffies;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4319	unsigned long load;
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4320
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4321	if (cfs_rq->last_h_load_update == now)
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4322	return;
				4323
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4324	cfs_rq->h_load_next = NULL;
				4325	for_each_sched_entity(se) {
				4326	cfs_rq = cfs_rq_of(se);
				4327	cfs_rq->h_load_next = se;
				4328	if (cfs_rq->last_h_load_update == now)
				4329	break;
				4330	}
Peter Zijlstra	a35b646	2012-08-08 21:46:40 +0200	[diff] [blame]	4331
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4332	if (!se) {
Vladimir Davydov	7e3115e	2013-09-14 19:39:46 +0400	[diff] [blame]	4333	cfs_rq->h_load = cfs_rq->runnable_load_avg;
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4334	cfs_rq->last_h_load_update = now;
				4335	}
				4336
				4337	while ((se = cfs_rq->h_load_next) != NULL) {
				4338	load = cfs_rq->h_load;
				4339	load = div64_ul(load * se->avg.load_avg_contrib,
				4340	cfs_rq->runnable_load_avg + 1);
				4341	cfs_rq = group_cfs_rq(se);
				4342	cfs_rq->h_load = load;
				4343	cfs_rq->last_h_load_update = now;
				4344	}
Peter Zijlstra	9763b67	2011-07-13 13:09:25 +0200	[diff] [blame]	4345	}
				4346
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4347	static unsigned long task_h_load(struct task_struct *p)
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4348	{
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4349	struct cfs_rq *cfs_rq = task_cfs_rq(p);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4350
Vladimir Davydov	6852079	2013-07-15 17:49:19 +0400	[diff] [blame]	4351	update_cfs_rq_h_load(cfs_rq);
Alex Shi	a003a25	2013-06-20 10:18:51 +0800	[diff] [blame]	4352	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
				4353	cfs_rq->runnable_load_avg + 1);
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4354	}
				4355	#else
/*
 * Stub used on the #else side of the surrounding conditional: there is
 * nothing to update for @cpu in this configuration, so this is a no-op.
 */
static inline void update_blocked_averages(int cpu)
{
}
				4359
Peter Zijlstra	367456c	2012-02-20 21:49:09 +0100	[diff] [blame]	4360	static unsigned long task_h_load(struct task_struct *p)
				4361	{
Alex Shi	a003a25	2013-06-20 10:18:51 +0800	[diff] [blame]	4362	return p->se.avg.load_avg_contrib;
Peter Zijlstra	230059de	2009-12-17 17:47:12 +0100	[diff] [blame]	4363	}
				4364	#endif
				4365
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4366	/******** Helpers for find_busiest_group **********************/
				4367	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4368	* sg_lb_stats - stats of a sched_group required for load_balancing
				4369	*/
				4370	struct sg_lb_stats {
				4371	unsigned long avg_load; /Avg load across the CPUs of the group /
				4372	unsigned long group_load; /* Total load over the CPUs of the group */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4373	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4374	unsigned long load_per_task;
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4375	unsigned long group_power;
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4376	unsigned int sum_nr_running; /* Nr tasks running in the group */
				4377	unsigned int group_capacity;
				4378	unsigned int idle_cpus;
				4379	unsigned int group_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4380	int group_imb; /* Is there an imbalance in the group ? */
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4381	int group_has_capacity; /* Is there extra capacity in the group? */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4382	};
				4383
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4384	/*
				4385	* sd_lb_stats - Structure to store the statistics of a sched_domain
				4386	* during load balancing.
				4387	*/
				4388	struct sd_lb_stats {
				4389	struct sched_group busiest; / Busiest group in this sd */
				4390	struct sched_group local; / Local group in this sd */
				4391	unsigned long total_load; /* Total load of all groups in sd */
				4392	unsigned long total_pwr; /* Total power of all groups in sd */
				4393	unsigned long avg_load; /* Average load across all groups in sd */
				4394
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4395	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4396	struct sg_lb_stats local_stat; /* Statistics of the local group */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4397	};
				4398
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4399	static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
				4400	{
				4401	/*
				4402	* Skimp on the clearing to avoid duplicate work. We can avoid clearing
				4403	* local_stat because update_sg_lb_stats() does a full clear/assignment.
				4404	* We must however clear busiest_stat::avg_load because
				4405	* update_sd_pick_busiest() reads this before assignment.
				4406	*/
				4407	*sds = (struct sd_lb_stats){
				4408	.busiest = NULL,
				4409	.local = NULL,
				4410	.total_load = 0UL,
				4411	.total_pwr = 0UL,
				4412	.busiest_stat = {
				4413	.avg_load = 0UL,
				4414	},
				4415	};
				4416	}
				4417
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4418	/**
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4419	* get_sd_load_idx - Obtain the load index for a given sched domain.
				4420	* @sd: The sched_domain whose load_idx is to be obtained.
				4421	* @idle: The Idle status of the CPU for whose sd load_icx is obtained.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	4422	*
				4423	* Return: The load index.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4424	*/
				4425	static inline int get_sd_load_idx(struct sched_domain *sd,
				4426	enum cpu_idle_type idle)
				4427	{
				4428	int load_idx;
				4429
				4430	switch (idle) {
				4431	case CPU_NOT_IDLE:
				4432	load_idx = sd->busy_idx;
				4433	break;
				4434
				4435	case CPU_NEWLY_IDLE:
				4436	load_idx = sd->newidle_idx;
				4437	break;
				4438	default:
				4439	load_idx = sd->idle_idx;
				4440	break;
				4441	}
				4442
				4443	return load_idx;
				4444	}
				4445
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4446	static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4447	{
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4448	return SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4449	}
				4450
				4451	unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
				4452	{
				4453	return default_scale_freq_power(sd, cpu);
				4454	}
				4455
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4456	static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4457	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4458	unsigned long weight = sd->span_weight;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4459	unsigned long smt_gain = sd->smt_gain;
				4460
				4461	smt_gain /= weight;
				4462
				4463	return smt_gain;
				4464	}
				4465
				4466	unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
				4467	{
				4468	return default_scale_smt_power(sd, cpu);
				4469	}
				4470
Li Zefan	15f803c	2013-03-05 16:07:11 +0800	[diff] [blame]	4471	static unsigned long scale_rt_power(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4472	{
				4473	struct rq *rq = cpu_rq(cpu);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4474	u64 total, available, age_stamp, avg;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4475
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4476	/*
				4477	* Since we're reading these variables without serialization make sure
				4478	* we read them once before doing sanity checks on them.
				4479	*/
				4480	age_stamp = ACCESS_ONCE(rq->age_stamp);
				4481	avg = ACCESS_ONCE(rq->rt_avg);
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4482
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	4483	total = sched_avg_period() + (rq_clock(rq) - age_stamp);
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4484
				4485	if (unlikely(total < avg)) {
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4486	/* Ensures that power won't end up being negative */
				4487	available = 0;
				4488	} else {
Peter Zijlstra	b654f7d	2012-05-22 14:04:28 +0200	[diff] [blame]	4489	available = total - avg;
Venkatesh Pallipadi	aa48380	2010-10-04 17:03:22 -0700	[diff] [blame]	4490	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4491
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4492	if (unlikely((s64)total < SCHED_POWER_SCALE))
				4493	total = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4494
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4495	total >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4496
				4497	return div_u64(available, total);
				4498	}
				4499
				4500	static void update_cpu_power(struct sched_domain *sd, int cpu)
				4501	{
Peter Zijlstra	669c55e	2010-04-16 14:59:29 +0200	[diff] [blame]	4502	unsigned long weight = sd->span_weight;
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4503	unsigned long power = SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4504	struct sched_group *sdg = sd->groups;
				4505
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4506	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
				4507	if (sched_feat(ARCH_POWER))
				4508	power *= arch_scale_smt_power(sd, cpu);
				4509	else
				4510	power *= default_scale_smt_power(sd, cpu);
				4511
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4512	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4513	}
				4514
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4515	sdg->sgp->power_orig = power;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4516
				4517	if (sched_feat(ARCH_POWER))
				4518	power *= arch_scale_freq_power(sd, cpu);
				4519	else
				4520	power *= default_scale_freq_power(sd, cpu);
				4521
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4522	power >>= SCHED_POWER_SHIFT;
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4523
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4524	power *= scale_rt_power(cpu);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4525	power >>= SCHED_POWER_SHIFT;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4526
				4527	if (!power)
				4528	power = 1;
				4529
Peter Zijlstra	e51fd5e	2010-05-31 12:37:30 +0200	[diff] [blame]	4530	cpu_rq(cpu)->cpu_power = power;
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4531	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4532	}
				4533
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	4534	void update_group_power(struct sched_domain *sd, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4535	{
				4536	struct sched_domain *child = sd->child;
				4537	struct sched_group group, sdg = sd->groups;
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4538	unsigned long power, power_orig;
Vincent Guittot	4ec4412	2011-12-12 20:21:08 +0100	[diff] [blame]	4539	unsigned long interval;
				4540
				4541	interval = msecs_to_jiffies(sd->balance_interval);
				4542	interval = clamp(interval, 1UL, max_load_balance_interval);
				4543	sdg->sgp->next_update = jiffies + interval;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4544
				4545	if (!child) {
				4546	update_cpu_power(sd, cpu);
				4547	return;
				4548	}
				4549
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4550	power_orig = power = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4551
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4552	if (child->flags & SD_OVERLAP) {
				4553	/*
				4554	* SD_OVERLAP domains cannot assume that child groups
				4555	* span the current group.
				4556	*/
				4557
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4558	for_each_cpu(cpu, sched_group_cpus(sdg)) {
				4559	struct sched_group *sg = cpu_rq(cpu)->sd->groups;
				4560
				4561	power_orig += sg->sgp->power_orig;
				4562	power += sg->sgp->power;
				4563	}
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4564	} else {
				4565	/*
				4566	* !SD_OVERLAP domains can assume that child groups
				4567	* span the current group.
				4568	*/
				4569
				4570	group = child->groups;
				4571	do {
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4572	power_orig += group->sgp->power_orig;
Peter Zijlstra	74a5ce2	2012-05-23 18:00:43 +0200	[diff] [blame]	4573	power += group->sgp->power;
				4574	group = group->next;
				4575	} while (group != child->groups);
				4576	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4577
Peter Zijlstra	863bffc	2013-08-28 11:44:39 +0200	[diff] [blame]	4578	sdg->sgp->power_orig = power_orig;
				4579	sdg->sgp->power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4580	}
				4581
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4582	/*
				4583	* Try and fix up capacity for tiny siblings, this is needed when
				4584	* things like SD_ASYM_PACKING need f_b_g to select another sibling
				4585	* which on its own isn't powerful enough.
				4586	*
				4587	* See update_sd_pick_busiest() and check_asym_packing().
				4588	*/
				4589	static inline int
				4590	fix_small_capacity(struct sched_domain sd, struct sched_group group)
				4591	{
				4592	/*
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4593	* Only siblings can have significantly less than SCHED_POWER_SCALE
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4594	*/
Peter Zijlstra	a6c75f2	2011-04-07 14:09:52 +0200	[diff] [blame]	4595	if (!(sd->flags & SD_SHARE_CPUPOWER))
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4596	return 0;
				4597
				4598	/*
				4599	* If ~90% of the cpu_power is still there, we're good.
				4600	*/
Peter Zijlstra	9c3f75c	2011-07-14 13:00:06 +0200	[diff] [blame]	4601	if (group->sgp->power * 32 > group->sgp->power_orig * 29)
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	4602	return 1;
				4603
				4604	return 0;
				4605	}
				4606
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4607	/*
				4608	* Group imbalance indicates (and tries to solve) the problem where balancing
				4609	* groups is inadequate due to tsk_cpus_allowed() constraints.
				4610	*
				4611	* Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
				4612	* cpumask covering 1 cpu of the first group and 3 cpus of the second group.
				4613	* Something like:
				4614	*
				4615	* { 0 1 2 3 } { 4 5 6 7 }
				4616	* * * * *
				4617	*
				4618	* If we were to balance group-wise we'd place two tasks in the first group and
				4619	* two tasks in the second group. Clearly this is undesired as it will overload
				4620	* cpu 3 and leave one of the cpus in the second group unused.
				4621	*
				4622	* The current solution to this issue is detecting the skew in the first group
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4623	* by noticing the lower domain failed to reach balance and had difficulty
				4624	* moving tasks due to affinity constraints.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4625	*
				4626	* When this is so detected; this group becomes a candidate for busiest; see
				4627	* update_sd_pick_busiest(). And calculcate_imbalance() and
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4628	* find_busiest_group() avoid some of the usual balance conditions to allow it
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4629	* to create an effective group imbalance.
				4630	*
				4631	* This is a somewhat tricky proposition since the next run might not find the
				4632	* group imbalance and decide the groups need to be balanced again. A most
				4633	* subtle and fragile situation.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4634	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4635
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4636	static inline int sg_imbalanced(struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4637	{
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4638	return group->sgp->imbalance;
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4639	}
				4640
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4641	/*
				4642	* Compute the group capacity.
				4643	*
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	4644	* Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
				4645	* first dividing out the smt factor and computing the actual number of cores
				4646	* and limit power unit capacity with that.
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4647	*/
				4648	static inline int sg_capacity(struct lb_env env, struct sched_group group)
				4649	{
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	4650	unsigned int capacity, smt, cpus;
				4651	unsigned int power, power_orig;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4652
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	4653	power = group->sgp->power;
				4654	power_orig = group->sgp->power_orig;
				4655	cpus = group->group_weight;
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4656
Peter Zijlstra	c61037e	2013-08-28 12:40:38 +0200	[diff] [blame]	4657	/* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
				4658	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
				4659	capacity = cpus / smt; /* cores */
				4660
				4661	capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4662	if (!capacity)
				4663	capacity = fix_small_capacity(env->sd, group);
				4664
				4665	return capacity;
				4666	}
				4667
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4668	/**
				4669	* update_sg_lb_stats - Update sched_group's statistics for load balancing.
				4670	* @env: The load balancing environment.
				4671	* @group: sched_group whose statistics are to be updated.
				4672	* @load_idx: Load index of sched_domain of this_cpu for load calc.
				4673	* @local_group: Does group contain this_cpu.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4674	* @sgs: variable to hold the statistics for this group.
				4675	*/
				4676	static inline void update_sg_lb_stats(struct lb_env *env,
				4677	struct sched_group *group, int load_idx,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	4678	int local_group, struct sg_lb_stats *sgs)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4679	{
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4680	unsigned long nr_running;
				4681	unsigned long load;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4682	int i;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4683
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4684	memset(sgs, 0, sizeof(*sgs));
				4685
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	4686	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4687	struct rq *rq = cpu_rq(i);
				4688
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4689	nr_running = rq->nr_running;
				4690
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4691	/* Bias balancing toward cpus of our domain */
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4692	if (local_group)
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	4693	load = target_load(i, load_idx);
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	4694	else
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4695	load = source_load(i, load_idx);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4696
				4697	sgs->group_load += load;
Peter Zijlstra	e44bc5c	2012-05-11 00:22:12 +0200	[diff] [blame]	4698	sgs->sum_nr_running += nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4699	sgs->sum_weighted_load += weighted_cpuload(i);
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4700	if (idle_cpu(i))
				4701	sgs->idle_cpus++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4702	}
				4703
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4704	/* Adjust by relative CPU power of the group */
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4705	sgs->group_power = group->sgp->power;
				4706	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4707
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4708	if (sgs->sum_nr_running)
Peter Zijlstra	38d0f77	2013-08-15 19:47:56 +0200	[diff] [blame]	4709	sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4710
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	4711	sgs->group_weight = group->group_weight;
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4712
Peter Zijlstra	b37d931	2013-08-28 11:50:34 +0200	[diff] [blame]	4713	sgs->group_imb = sg_imbalanced(group);
				4714	sgs->group_capacity = sg_capacity(env, group);
				4715
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	4716	if (sgs->group_capacity > sgs->sum_nr_running)
				4717	sgs->group_has_capacity = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4718	}
				4719
				4720	/**
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4721	* update_sd_pick_busiest - return 1 on busiest group
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4722	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4723	* @sds: sched_domain statistics
				4724	* @sg: sched_group candidate to be checked for being the busiest
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4725	* @sgs: sched_group statistics
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4726	*
				4727	* Determine if @sg is a busier group than the previously selected
				4728	* busiest group.
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	4729	*
				4730	* Return: %true if @sg is a busier group than the previously selected
				4731	* busiest group. %false otherwise.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4732	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4733	static bool update_sd_pick_busiest(struct lb_env *env,
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4734	struct sd_lb_stats *sds,
				4735	struct sched_group *sg,
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4736	struct sg_lb_stats *sgs)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4737	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4738	if (sgs->avg_load <= sds->busiest_stat.avg_load)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4739	return false;
				4740
				4741	if (sgs->sum_nr_running > sgs->group_capacity)
				4742	return true;
				4743
				4744	if (sgs->group_imb)
				4745	return true;
				4746
				4747	/*
				4748	* ASYM_PACKING needs to move all the work to the lowest
				4749	* numbered CPUs in the group, therefore mark all groups
				4750	* higher than ourself as busy.
				4751	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4752	if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
				4753	env->dst_cpu < group_first_cpu(sg)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4754	if (!sds->busiest)
				4755	return true;
				4756
				4757	if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
				4758	return true;
				4759	}
				4760
				4761	return false;
				4762	}
				4763
				4764	/**
Hui Kang	461819a	2011-10-11 23:00:59 -0400	[diff] [blame]	4765	* update_sd_lb_stats - Update sched_domain's statistics for load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4766	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4767	* @balance: Should we balance.
				4768	* @sds: variable to hold the statistics for this sched_domain.
				4769	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4770	static inline void update_sd_lb_stats(struct lb_env *env,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	4771	struct sd_lb_stats *sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4772	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4773	struct sched_domain *child = env->sd->child;
				4774	struct sched_group *sg = env->sd->groups;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4775	struct sg_lb_stats tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4776	int load_idx, prefer_sibling = 0;
				4777
				4778	if (child && child->flags & SD_PREFER_SIBLING)
				4779	prefer_sibling = 1;
				4780
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4781	load_idx = get_sd_load_idx(env->sd, env->idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4782
				4783	do {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4784	struct sg_lb_stats *sgs = &tmp_sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4785	int local_group;
				4786
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4787	local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4788	if (local_group) {
				4789	sds->local = sg;
				4790	sgs = &sds->local_stat;
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4791
				4792	if (env->idle != CPU_NEWLY_IDLE \|\|
				4793	time_after_eq(jiffies, sg->sgp->next_update))
				4794	update_group_power(env->sd, env->dst_cpu);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4795	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4796
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4797	update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4798
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4799	if (local_group)
				4800	goto next_group;
				4801
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4802	/*
				4803	* In case the child domain prefers tasks go to siblings
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4804	* first, lower the sg capacity to one so that we'll try
Nikhil Rao	75dd321	2010-10-15 13:12:30 -0700	[diff] [blame]	4805	* and move all the excess tasks away. We lower the capacity
				4806	* of a group only if the local group has the capacity to fit
				4807	* these excess tasks, i.e. nr_running < group_capacity. The
				4808	* extra check prevents the case where you always pull from the
				4809	* heaviest group when it is already under-utilized (possible
				4810	* with a large weight task outweighs the tasks on the system).
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4811	*/
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4812	if (prefer_sibling && sds->local &&
				4813	sds->local_stat.group_has_capacity)
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	4814	sgs->group_capacity = min(sgs->group_capacity, 1U);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4815
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4816	if (update_sd_pick_busiest(env, sds, sg, sgs)) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4817	sds->busiest = sg;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4818	sds->busiest_stat = *sgs;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4819	}
				4820
Peter Zijlstra	b72ff13	2013-08-28 10:32:32 +0200	[diff] [blame]	4821	next_group:
				4822	/* Now, start updating sd_lb_stats */
				4823	sds->total_load += sgs->group_load;
				4824	sds->total_pwr += sgs->group_power;
				4825
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4826	sg = sg->next;
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4827	} while (sg != env->sd->groups);
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4828	}
				4829
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4830	/**
				4831	* check_asym_packing - Check to see if the group is packed into the
				4832	* sched doman.
				4833	*
				4834	* This is primarily intended to used at the sibling level. Some
				4835	* cores like POWER7 prefer to use lower numbered SMT threads. In the
				4836	* case of POWER7, it can move to lower SMT modes only when higher
				4837	* threads are idle. When in lower SMT modes, the threads will
				4838	* perform better since they share less core resources. Hence when we
				4839	* have idle threads, we want them to be the higher ones.
				4840	*
				4841	* This packing function is run on idle threads. It checks to see if
				4842	* the busiest CPU in this domain (core in the P7 case) has a higher
				4843	* CPU number than the packing function is being run on. Here we are
				4844	* assuming lower CPU number will be equivalent to lower a SMT thread
				4845	* number.
				4846	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	4847	* Return: 1 when packing is required and a task should be moved to
Michael Neuling	b6b1229	2010-06-10 12:06:21 +1000	[diff] [blame]	4848	* this CPU. The amount of the imbalance is returned in *imbalance.
				4849	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4850	* @env: The load balancing environment.
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4851	* @sds: Statistics of the sched_domain which is to be packed
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4852	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4853	static int check_asym_packing(struct lb_env env, struct sd_lb_stats sds)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4854	{
				4855	int busiest_cpu;
				4856
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4857	if (!(env->sd->flags & SD_ASYM_PACKING))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4858	return 0;
				4859
				4860	if (!sds->busiest)
				4861	return 0;
				4862
				4863	busiest_cpu = group_first_cpu(sds->busiest);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4864	if (env->dst_cpu > busiest_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4865	return 0;
				4866
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4867	env->imbalance = DIV_ROUND_CLOSEST(
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4868	sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
				4869	SCHED_POWER_SCALE);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4870
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	4871	return 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4872	}
				4873
				4874	/**
				4875	* fix_small_imbalance - Calculate the minor imbalance that exists
				4876	* amongst the groups of a sched_domain, during
				4877	* load balancing.
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	4878	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4879	* @sds: Statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4880	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4881	static inline
				4882	void fix_small_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4883	{
				4884	unsigned long tmp, pwr_now = 0, pwr_move = 0;
				4885	unsigned int imbn = 2;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4886	unsigned long scaled_busy_load_per_task;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4887	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4888
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4889	local = &sds->local_stat;
				4890	busiest = &sds->busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4891
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4892	if (!local->sum_nr_running)
				4893	local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
				4894	else if (busiest->load_per_task > local->load_per_task)
				4895	imbn = 1;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4896
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4897	scaled_busy_load_per_task =
				4898	(busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4899	busiest->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4900
Vladimir Davydov	3029ede	2013-09-15 17:49:14 +0400	[diff] [blame]	4901	if (busiest->avg_load + scaled_busy_load_per_task >=
				4902	local->avg_load + (scaled_busy_load_per_task * imbn)) {
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4903	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4904	return;
				4905	}
				4906
				4907	/*
				4908	* OK, we don't have enough imbalance to justify moving tasks,
				4909	* however we may be able to increase total CPU power used by
				4910	* moving them.
				4911	*/
				4912
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4913	pwr_now += busiest->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4914	min(busiest->load_per_task, busiest->avg_load);
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4915	pwr_now += local->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4916	min(local->load_per_task, local->avg_load);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4917	pwr_now /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4918
				4919	/* Amount of load we'd subtract */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4920	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4921	busiest->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4922	if (busiest->avg_load > tmp) {
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4923	pwr_move += busiest->group_power *
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4924	min(busiest->load_per_task,
				4925	busiest->avg_load - tmp);
				4926	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4927
				4928	/* Amount of load we'd add */
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4929	if (busiest->avg_load * busiest->group_power <
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4930	busiest->load_per_task * SCHED_POWER_SCALE) {
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4931	tmp = (busiest->avg_load * busiest->group_power) /
				4932	local->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4933	} else {
				4934	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4935	local->group_power;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4936	}
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4937	pwr_move += local->group_power *
				4938	min(local->load_per_task, local->avg_load + tmp);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4939	pwr_move /= SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4940
				4941	/* Move if we gain throughput */
				4942	if (pwr_move > pwr_now)
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4943	env->imbalance = busiest->load_per_task;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4944	}
				4945
				4946	/**
				4947	* calculate_imbalance - Calculate the amount of imbalance present within the
				4948	* groups of a given sched_domain during load balance.
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4949	* @env: load balance environment
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4950	* @sds: statistics of the sched_domain whose imbalance is to be calculated.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4951	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4952	static inline void calculate_imbalance(struct lb_env env, struct sd_lb_stats sds)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4953	{
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4954	unsigned long max_pull, load_above_capacity = ~0UL;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4955	struct sg_lb_stats local, busiest;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4956
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4957	local = &sds->local_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4958	busiest = &sds->busiest_stat;
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4959
				4960	if (busiest->group_imb) {
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4961	/*
				4962	* In the group_imb case we cannot rely on group-wide averages
				4963	* to ensure cpu-load equilibrium, look at wider averages. XXX
				4964	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4965	busiest->load_per_task =
				4966	min(busiest->load_per_task, sds->avg_load);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4967	}
				4968
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4969	/*
				4970	* In the presence of smp nice balancing, certain scenarios can have
				4971	* max load less than avg load(as we skip the groups at or below
				4972	* its cpu_power, while calculating max_load..)
				4973	*/
Vladimir Davydov	b188555	2013-09-15 17:49:13 +0400	[diff] [blame]	4974	if (busiest->avg_load <= sds->avg_load \|\|
				4975	local->avg_load >= sds->avg_load) {
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	4976	env->imbalance = 0;
				4977	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	4978	}
				4979
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4980	if (!busiest->group_imb) {
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4981	/*
				4982	* Don't want to pull so many tasks that a group would go idle.
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	4983	* Except of course for the group_imb case, since then we might
				4984	* have to drop below capacity to reach cpu-load equilibrium.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4985	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	4986	load_above_capacity =
				4987	(busiest->sum_nr_running - busiest->group_capacity);
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4988
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	4989	load_above_capacity = (SCHED_LOAD_SCALE SCHED_POWER_SCALE);
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	4990	load_above_capacity /= busiest->group_power;
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	4991	}
				4992
				4993	/*
				4994	* We're trying to get all the cpus to the average_load, so we don't
				4995	* want to push ourselves above the average load, nor do we wish to
				4996	* reduce the max loaded cpu below the average load. At the same time,
				4997	* we also don't want to reduce the group load below the group capacity
				4998	* (so that we can implement power-savings policies etc). Thus we look
				4999	* for the minimum possible imbalance.
Suresh Siddha	dd5feea	2010-02-23 16:13:52 -0800	[diff] [blame]	5000	*/
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5001	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5002
				5003	/* How much load to actually move to equalise the imbalance */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5004	env->imbalance = min(
Peter Zijlstra	3ae11c9	2013-08-15 20:37:48 +0200	[diff] [blame]	5005	max_pull * busiest->group_power,
				5006	(sds->avg_load - local->avg_load) * local->group_power
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5007	) / SCHED_POWER_SCALE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5008
				5009	/*
				5010	* if *imbalance is less than the average load per runnable task
Lucas De Marchi	25985ed	2011-03-30 22:57:33 -0300	[diff] [blame]	5011	* there is no guarantee that any tasks will be moved so we'll have
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5012	* a think about bumping its value to force at least one task to be
				5013	* moved
				5014	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5015	if (env->imbalance < busiest->load_per_task)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5016	return fix_small_imbalance(env, sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5017	}
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5018
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5019	/***** find_busiest_group() helpers end here *******************/
				5020
				5021	/**
				5022	* find_busiest_group - Returns the busiest group within the sched_domain
				5023	* if there is an imbalance. If there isn't an imbalance, and
				5024	* the user has opted for power-savings, it returns a group whose
				5025	* CPUs can be put to idle by rebalancing those tasks elsewhere, if
				5026	* such a group exists.
				5027	*
				5028	* Also calculates the amount of weighted load which should be moved
				5029	* to restore balance.
				5030	*
Randy Dunlap	cd96891	2012-06-08 13:18:33 -0700	[diff] [blame]	5031	* @env: The load balancing environment.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5032	*
Yacine Belkadi	e69f618	2013-07-12 20:45:47 +0200	[diff] [blame]	5033	* Return: - The busiest group if imbalance exists.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5034	* - If no imbalance and user has opted for power-savings balance,
				5035	* return the least loaded group whose CPUs can be
				5036	* put to idle by rebalancing its tasks onto our group.
				5037	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5038	static struct sched_group find_busiest_group(struct lb_env env)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5039	{
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5040	struct sg_lb_stats local, busiest;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5041	struct sd_lb_stats sds;
				5042
Peter Zijlstra	147c5fc	2013-08-19 15:22:57 +0200	[diff] [blame]	5043	init_sd_lb_stats(&sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5044
				5045	/*
				5046	* Compute the various statistics relavent for load balancing at
				5047	* this level.
				5048	*/
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5049	update_sd_lb_stats(env, &sds);
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5050	local = &sds.local_stat;
				5051	busiest = &sds.busiest_stat;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5052
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5053	if ((env->idle == CPU_IDLE \|\| env->idle == CPU_NEWLY_IDLE) &&
				5054	check_asym_packing(env, &sds))
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5055	return sds.busiest;
				5056
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5057	/* There is no busy sibling group to pull tasks from */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5058	if (!sds.busiest \|\| busiest->sum_nr_running == 0)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5059	goto out_balanced;
				5060
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5061	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
Ken Chen	b0432d8	2011-04-07 17:23:22 -0700	[diff] [blame]	5062
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5063	/*
				5064	* If the busiest group is imbalanced the below checks don't
Peter Zijlstra	30ce5da	2013-08-15 20:29:29 +0200	[diff] [blame]	5065	* work because they assume all things are equal, which typically
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5066	* isn't true due to cpus_allowed constraints and the like.
				5067	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5068	if (busiest->group_imb)
Peter Zijlstra	866ab43	2011-02-21 18:56:47 +0100	[diff] [blame]	5069	goto force_balance;
				5070
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5071	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5072	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
				5073	!busiest->group_has_capacity)
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5074	goto force_balance;
				5075
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5076	/*
				5077	* If the local group is more busy than the selected busiest group
				5078	* don't try and pull any tasks.
				5079	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5080	if (local->avg_load >= busiest->avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5081	goto out_balanced;
				5082
Peter Zijlstra	cc57aa8	2011-02-21 18:55:32 +0100	[diff] [blame]	5083	/*
				5084	* Don't pull any tasks if this group is already above the domain
				5085	* average load.
				5086	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5087	if (local->avg_load >= sds.avg_load)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5088	goto out_balanced;
				5089
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5090	if (env->idle == CPU_IDLE) {
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5091	/*
				5092	* This cpu is idle. If the busiest group load doesn't
				5093	* have more tasks than the number of available cpu's and
				5094	* there is no imbalance between this and busiest group
				5095	* wrt to idle cpu's, it is balanced.
				5096	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5097	if ((local->idle_cpus < busiest->idle_cpus) &&
				5098	busiest->sum_nr_running <= busiest->group_weight)
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5099	goto out_balanced;
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	5100	} else {
				5101	/*
				5102	* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
				5103	* imbalance_pct to be conservative.
				5104	*/
Joonsoo Kim	56cf515	2013-08-06 17:36:43 +0900	[diff] [blame]	5105	if (100 * busiest->avg_load <=
				5106	env->sd->imbalance_pct * local->avg_load)
Peter Zijlstra	c186faf	2011-02-21 18:52:53 +0100	[diff] [blame]	5107	goto out_balanced;
Suresh Siddha	aae6d3d	2010-09-17 15:02:32 -0700	[diff] [blame]	5108	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5109
Nikhil Rao	fab4762	2010-10-15 13:12:29 -0700	[diff] [blame]	5110	force_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5111	/* Looks like there is an imbalance. Compute it */
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5112	calculate_imbalance(env, &sds);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5113	return sds.busiest;
				5114
				5115	out_balanced:
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5116	env->imbalance = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5117	return NULL;
				5118	}
				5119
				5120	/*
				5121	* find_busiest_queue - find the busiest runqueue among the cpus in group.
				5122	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5123	static struct rq find_busiest_queue(struct lb_env env,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5124	struct sched_group *group)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5125	{
				5126	struct rq busiest = NULL, rq;
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5127	unsigned long busiest_load = 0, busiest_power = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5128	int i;
				5129
Peter Zijlstra	6906a40	2013-08-19 15:20:21 +0200	[diff] [blame]	5130	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5131	unsigned long power = power_of(i);
Nikhil Rao	1399fa7	2011-05-18 10:09:39 -0700	[diff] [blame]	5132	unsigned long capacity = DIV_ROUND_CLOSEST(power,
				5133	SCHED_POWER_SCALE);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5134	unsigned long wl;
				5135
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5136	if (!capacity)
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5137	capacity = fix_small_capacity(env->sd, group);
Srivatsa Vaddagiri	9d5efe0	2010-06-08 14:57:02 +1000	[diff] [blame]	5138
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5139	rq = cpu_rq(i);
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5140	wl = weighted_cpuload(i);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5141
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5142	/*
				5143	* When comparing with imbalance, use weighted_cpuload()
				5144	* which is not scaled with the cpu power.
				5145	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5146	if (capacity && rq->nr_running == 1 && wl > env->imbalance)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5147	continue;
				5148
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5149	/*
				5150	* For the load comparisons with the other cpu's, consider
				5151	* the weighted_cpuload() scaled with the cpu power, so that
				5152	* the load can be moved away from the cpu that is potentially
				5153	* running at a lower capacity.
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5154	*
				5155	* Thus we're looking for max(wl_i / power_i), crosswise
				5156	* multiplication to rid ourselves of the division works out
				5157	* to: wl_i * power_j > wl_j * power_i; where j is our
				5158	* previous maximum.
Thomas Gleixner	6e40f5b	2010-02-16 16:48:56 +0100	[diff] [blame]	5159	*/
Joonsoo Kim	95a79b8	2013-08-06 17:36:41 +0900	[diff] [blame]	5160	if (wl * busiest_power > busiest_load * power) {
				5161	busiest_load = wl;
				5162	busiest_power = power;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5163	busiest = rq;
				5164	}
				5165	}
				5166
				5167	return busiest;
				5168	}
				5169
				5170	/*
				5171	* Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
				5172	* so long as it is large enough.
				5173	*/
				5174	#define MAX_PINNED_INTERVAL 512
				5175
				5176	/* Working cpumask for load_balance and load_balance_newidle. */
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	5177	DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5178
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5179	static int need_active_balance(struct lb_env *env)
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	5180	{
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5181	struct sched_domain *sd = env->sd;
				5182
				5183	if (env->idle == CPU_NEWLY_IDLE) {
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5184
				5185	/*
				5186	* ASYM_PACKING needs to force migrate tasks from busy but
				5187	* higher numbered CPUs in order to pack all tasks in the
				5188	* lowest numbered CPUs.
				5189	*/
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5190	if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
Michael Neuling	532cb4c	2010-06-08 14:57:02 +1000	[diff] [blame]	5191	return 1;
Peter Zijlstra	1af3ed3	2009-12-23 15:10:31 +0100	[diff] [blame]	5192	}
				5193
				5194	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
				5195	}
				5196
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5197	static int active_load_balance_cpu_stop(void *data);
				5198
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5199	static int should_we_balance(struct lb_env *env)
				5200	{
				5201	struct sched_group *sg = env->sd->groups;
				5202	struct cpumask sg_cpus, sg_mask;
				5203	int cpu, balance_cpu = -1;
				5204
				5205	/*
				5206	* In the newly idle case, we will allow all the cpu's
				5207	* to do the newly idle load balance.
				5208	*/
				5209	if (env->idle == CPU_NEWLY_IDLE)
				5210	return 1;
				5211
				5212	sg_cpus = sched_group_cpus(sg);
				5213	sg_mask = sched_group_mask(sg);
				5214	/* Try to find first idle cpu */
				5215	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
				5216	if (!cpumask_test_cpu(cpu, sg_mask) \|\| !idle_cpu(cpu))
				5217	continue;
				5218
				5219	balance_cpu = cpu;
				5220	break;
				5221	}
				5222
				5223	if (balance_cpu == -1)
				5224	balance_cpu = group_balance_cpu(sg);
				5225
				5226	/*
				5227	* First idle cpu or the first cpu(busiest) in this sched group
				5228	* is eligible for doing load balancing at this and above domains.
				5229	*/
Joonsoo Kim	b0cff9d	2013-09-10 15:54:49 +0900	[diff] [blame]	5230	return balance_cpu == env->dst_cpu;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5231	}
				5232
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5233	/*
				5234	* Check this_cpu to ensure it is balanced within domain. Attempt to move
				5235	* tasks if there is an imbalance.
				5236	*/
				5237	static int load_balance(int this_cpu, struct rq *this_rq,
				5238	struct sched_domain *sd, enum cpu_idle_type idle,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5239	int *continue_balancing)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5240	{
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5241	int ld_moved, cur_ld_moved, active_balance = 0;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5242	struct sched_domain *sd_parent = sd->parent;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5243	struct sched_group *group;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5244	struct rq *busiest;
				5245	unsigned long flags;
Joonsoo Kim	e6252c3	2013-04-23 17:27:41 +0900	[diff] [blame]	5246	struct cpumask *cpus = __get_cpu_var(load_balance_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5247
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5248	struct lb_env env = {
				5249	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5250	.dst_cpu = this_cpu,
				5251	.dst_rq = this_rq,
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5252	.dst_grpmask = sched_group_cpus(sd->groups),
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5253	.idle = idle,
Peter Zijlstra	eb95308	2012-04-17 13:38:40 +0200	[diff] [blame]	5254	.loop_break = sched_nr_migrate_break,
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5255	.cpus = cpus,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5256	};
				5257
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5258	/*
				5259	* For NEWLY_IDLE load_balancing, we don't need to consider
				5260	* other cpus in our group
				5261	*/
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5262	if (idle == CPU_NEWLY_IDLE)
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5263	env.dst_grpmask = NULL;
Joonsoo Kim	cfc0311	2013-04-23 17:27:39 +0900	[diff] [blame]	5264
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5265	cpumask_copy(cpus, cpu_active_mask);
				5266
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5267	schedstat_inc(sd, lb_count[idle]);
				5268
				5269	redo:
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5270	if (!should_we_balance(&env)) {
				5271	*continue_balancing = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5272	goto out_balanced;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5273	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5274
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5275	group = find_busiest_group(&env);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5276	if (!group) {
				5277	schedstat_inc(sd, lb_nobusyg[idle]);
				5278	goto out_balanced;
				5279	}
				5280
Michael Wang	b9403130	2012-07-12 16:10:13 +0800	[diff] [blame]	5281	busiest = find_busiest_queue(&env, group);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5282	if (!busiest) {
				5283	schedstat_inc(sd, lb_nobusyq[idle]);
				5284	goto out_balanced;
				5285	}
				5286
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5287	BUG_ON(busiest == env.dst_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5288
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5289	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5290
				5291	ld_moved = 0;
				5292	if (busiest->nr_running > 1) {
				5293	/*
				5294	* Attempt to move tasks. If find_busiest_group has found
				5295	* an imbalance but busiest->nr_running <= 1, the group is
				5296	* still unbalanced. ld_moved simply stays zero, so it is
				5297	* correctly treated as an imbalance.
				5298	*/
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5299	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	c82513e	2012-04-26 13:12:27 +0200	[diff] [blame]	5300	env.src_cpu = busiest->cpu;
				5301	env.src_rq = busiest;
				5302	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5303
Peter Zijlstra	5d6523e	2012-03-10 00:07:36 +0100	[diff] [blame]	5304	more_balance:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5305	local_irq_save(flags);
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5306	double_rq_lock(env.dst_rq, busiest);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5307
				5308	/*
				5309	* cur_ld_moved - load moved in current iteration
				5310	* ld_moved - cumulative load moved across iterations
				5311	*/
				5312	cur_ld_moved = move_tasks(&env);
				5313	ld_moved += cur_ld_moved;
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5314	double_rq_unlock(env.dst_rq, busiest);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5315	local_irq_restore(flags);
				5316
				5317	/*
				5318	* some other cpu did the load balance for us.
				5319	*/
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5320	if (cur_ld_moved && env.dst_cpu != smp_processor_id())
				5321	resched_cpu(env.dst_cpu);
				5322
Joonsoo Kim	f1cd085	2013-04-23 17:27:37 +0900	[diff] [blame]	5323	if (env.flags & LBF_NEED_BREAK) {
				5324	env.flags &= ~LBF_NEED_BREAK;
				5325	goto more_balance;
				5326	}
				5327
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5328	/*
				5329	* Revisit (affine) tasks on src_cpu that couldn't be moved to
				5330	* us and move them to an alternate dst_cpu in our sched_group
				5331	* where they can run. The upper limit on how many times we
				5332	* iterate on same src_cpu is dependent on number of cpus in our
				5333	* sched_group.
				5334	*
				5335	* This changes load balance semantics a bit on who can move
				5336	* load to a given_cpu. In addition to the given_cpu itself
				5337	* (or a ilb_cpu acting on its behalf where given_cpu is
				5338	* nohz-idle), we now have balance_cpu in a position to move
				5339	* load to given_cpu. In rare situations, this may cause
				5340	* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
				5341	* _independently_ and at _same_ time to move some load to
				5342	* given_cpu) causing excess load to be moved to given_cpu.
				5343	* This however should not happen so much in practice and
				5344	* moreover subsequent load balance cycles should correct the
				5345	* excess load moved.
				5346	*/
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5347	if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5348
Vladimir Davydov	7aff2e3	2013-09-15 21:30:13 +0400	[diff] [blame]	5349	/* Prevent to re-select dst_cpu via env's cpus */
				5350	cpumask_clear_cpu(env.dst_cpu, env.cpus);
				5351
Michael Wang	78feefc	2012-08-06 16:41:59 +0800	[diff] [blame]	5352	env.dst_rq = cpu_rq(env.new_dst_cpu);
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5353	env.dst_cpu = env.new_dst_cpu;
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5354	env.flags &= ~LBF_DST_PINNED;
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5355	env.loop = 0;
				5356	env.loop_break = sched_nr_migrate_break;
Joonsoo Kim	e02e60c	2013-04-23 17:27:42 +0900	[diff] [blame]	5357
Srivatsa Vaddagiri	88b8dac	2012-06-19 17:43:15 +0530	[diff] [blame]	5358	/*
				5359	* Go back to "more_balance" rather than "redo" since we
				5360	* need to continue with same src_cpu.
				5361	*/
				5362	goto more_balance;
				5363	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5364
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5365	/*
				5366	* We failed to reach balance because of affinity.
				5367	*/
				5368	if (sd_parent) {
				5369	int *group_imbalance = &sd_parent->groups->sgp->imbalance;
				5370
				5371	if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
				5372	*group_imbalance = 1;
				5373	} else if (*group_imbalance)
				5374	*group_imbalance = 0;
				5375	}
				5376
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5377	/* All tasks on this runqueue were pinned by CPU affinity */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5378	if (unlikely(env.flags & LBF_ALL_PINNED)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5379	cpumask_clear_cpu(cpu_of(busiest), cpus);
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5380	if (!cpumask_empty(cpus)) {
				5381	env.loop = 0;
				5382	env.loop_break = sched_nr_migrate_break;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5383	goto redo;
Prashanth Nageshappa	bbf18b1	2012-06-19 17:52:07 +0530	[diff] [blame]	5384	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5385	goto out_balanced;
				5386	}
				5387	}
				5388
				5389	if (!ld_moved) {
				5390	schedstat_inc(sd, lb_failed[idle]);
Venkatesh Pallipadi	58b26c4	2010-09-10 18:19:17 -0700	[diff] [blame]	5391	/*
				5392	* Increment the failure counter only on periodic balance.
				5393	* We do not want newidle balance, which can be very
				5394	* frequent, pollute the failure counter causing
				5395	* excessive cache_hot migrations and active balances.
				5396	*/
				5397	if (idle != CPU_NEWLY_IDLE)
				5398	sd->nr_balance_failed++;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5399
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5400	if (need_active_balance(&env)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5401	raw_spin_lock_irqsave(&busiest->lock, flags);
				5402
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5403	/* don't kick the active_load_balance_cpu_stop,
				5404	* if the curr task on busiest cpu can't be
				5405	* moved to this_cpu
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5406	*/
				5407	if (!cpumask_test_cpu(this_cpu,
Peter Zijlstra	fa17b50	2011-06-16 12:23:22 +0200	[diff] [blame]	5408	tsk_cpus_allowed(busiest->curr))) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5409	raw_spin_unlock_irqrestore(&busiest->lock,
				5410	flags);
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5411	env.flags \|= LBF_ALL_PINNED;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5412	goto out_one_pinned;
				5413	}
				5414
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5415	/*
				5416	* ->active_balance synchronizes accesses to
				5417	* ->active_balance_work. Once set, it's cleared
				5418	* only after active load balance is finished.
				5419	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5420	if (!busiest->active_balance) {
				5421	busiest->active_balance = 1;
				5422	busiest->push_cpu = this_cpu;
				5423	active_balance = 1;
				5424	}
				5425	raw_spin_unlock_irqrestore(&busiest->lock, flags);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5426
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5427	if (active_balance) {
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5428	stop_one_cpu_nowait(cpu_of(busiest),
				5429	active_load_balance_cpu_stop, busiest,
				5430	&busiest->active_balance_work);
Peter Zijlstra	bd939f4	2012-05-02 14:20:37 +0200	[diff] [blame]	5431	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5432
				5433	/*
				5434	* We've kicked active balancing, reset the failure
				5435	* counter.
				5436	*/
				5437	sd->nr_balance_failed = sd->cache_nice_tries+1;
				5438	}
				5439	} else
				5440	sd->nr_balance_failed = 0;
				5441
				5442	if (likely(!active_balance)) {
				5443	/* We were unbalanced, so reset the balancing interval */
				5444	sd->balance_interval = sd->min_interval;
				5445	} else {
				5446	/*
				5447	* If we've begun active balancing, start to back off. This
				5448	* case may not be covered by the all_pinned logic if there
				5449	* is only 1 task on the busy runqueue (because we don't call
				5450	* move_tasks).
				5451	*/
				5452	if (sd->balance_interval < sd->max_interval)
				5453	sd->balance_interval *= 2;
				5454	}
				5455
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5456	goto out;
				5457
				5458	out_balanced:
				5459	schedstat_inc(sd, lb_balanced[idle]);
				5460
				5461	sd->nr_balance_failed = 0;
				5462
				5463	out_one_pinned:
				5464	/* tune up the balancing interval */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5465	if (((env.flags & LBF_ALL_PINNED) &&
Peter Zijlstra	5b54b56	2011-09-22 15:23:13 +0200	[diff] [blame]	5466	sd->balance_interval < MAX_PINNED_INTERVAL) \|\|
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5467	(sd->balance_interval < sd->max_interval))
				5468	sd->balance_interval *= 2;
				5469
Venkatesh Pallipadi	46e49b3	2011-02-14 14:38:50 -0800	[diff] [blame]	5470	ld_moved = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5471	out:
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5472	return ld_moved;
				5473	}
				5474
				5475	/*
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5476	* idle_balance is called by schedule() if this_cpu is about to become
				5477	* idle. Attempts to pull tasks from other CPUs.
					*
					* this_rq->lock is dropped while the domains are walked and re-taken
					* before returning (the early return below leaves it untouched).
					*
					* The walk over this_cpu's domains stops when this_rq->avg_idle is
					* smaller than the time already spent plus the domain's recorded
					* max_newidle_lb_cost, or as soon as load_balance() reports a pulled
					* task.
				5478	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5479	void idle_balance(int this_cpu, struct rq *this_rq)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5480	{
				5481	struct sched_domain *sd;
				5482	int pulled_task = 0;
					/* Default deadline: one second (HZ jiffies) from now. */
				5483	unsigned long next_balance = jiffies + HZ;
					/* Time spent in load_balance() so far during this call. */
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5484	u64 curr_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5485
					/* Stamp the start of idleness; reset to 0 below if a task is pulled. */
Frederic Weisbecker	78becc2	2013-04-12 01:51:02 +0200	[diff] [blame]	5486	this_rq->idle_stamp = rq_clock(this_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5487
					/* avg_idle is below the migration-cost threshold: do not balance. */
				5488	if (this_rq->avg_idle < sysctl_sched_migration_cost)
				5489	return;
				5490
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5491	/*
				5492	* Drop the rq->lock, but keep IRQ/preempt disabled.
				5493	*/
				5494	raw_spin_unlock(&this_rq->lock);
				5495
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5496	update_blocked_averages(this_cpu);
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5497	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5498	for_each_domain(this_cpu, sd) {
				5499	unsigned long interval;
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5500	int continue_balancing = 1;
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5501	u64 t0, domain_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5502
				5503	if (!(sd->flags & SD_LOAD_BALANCE))
				5504	continue;
				5505
					/*
					* avg_idle cannot cover the cost so far plus this domain's
					* recorded worst case: stop here.
					*/
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5506	if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
				5507	break;
				5508
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5509	if (sd->flags & SD_BALANCE_NEWIDLE) {
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5510	t0 = sched_clock_cpu(this_cpu);
				5511
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5512	/* If we've pulled tasks over stop searching: */
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5513	pulled_task = load_balance(this_cpu, this_rq,
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5514	sd, CPU_NEWLY_IDLE,
				5515	&continue_balancing);
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5516
					/* Record this pass's duration and keep the per-domain maximum. */
				5517	domain_cost = sched_clock_cpu(this_cpu) - t0;
				5518	if (domain_cost > sd->max_newidle_lb_cost)
				5519	sd->max_newidle_lb_cost = domain_cost;
				5520
				5521	curr_cost += domain_cost;
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5522	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5523
					/* Keep the earliest balance deadline among the visited domains. */
				5524	interval = msecs_to_jiffies(sd->balance_interval);
				5525	if (time_after(next_balance, sd->last_balance + interval))
				5526	next_balance = sd->last_balance + interval;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5527	if (pulled_task) {
				5528	this_rq->idle_stamp = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5529	break;
Nikhil Rao	d5ad140	2010-11-17 11:42:04 -0800	[diff] [blame]	5530	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5531	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5532	rcu_read_unlock();
Peter Zijlstra	f492e12	2009-12-23 15:29:42 +0100	[diff] [blame]	5533
				5534	raw_spin_lock(&this_rq->lock);
				5535
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5536	if (pulled_task \|\| time_after(jiffies, this_rq->next_balance)) {
				5537	/*
				5538	* We are going idle. next_balance may be set based on
				5539	* a busy processor. So reset next_balance.
				5540	*/
				5541	this_rq->next_balance = next_balance;
				5542	}
Jason Low	9bd721c	2013-09-13 11:26:52 -0700	[diff] [blame]	5543
					/* Remember the most expensive walk seen on this rq. */
				5544	if (curr_cost > this_rq->max_idle_balance_cost)
				5545	this_rq->max_idle_balance_cost = curr_cost;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5546	}
				5547
				5548	/*
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5549	* active_load_balance_cpu_stop is run by cpu stopper. It pushes
				5550	* running tasks off the busiest CPU onto idle CPUs. It requires at
				5551	* least 1 task to be running on each physical CPU where possible, and
				5552	* avoids physical / logical imbalances.
					*
					* @data: the busiest runqueue; its ->push_cpu names the target CPU.
					*
					* Tries to move one task from the busiest rq to the target rq via
					* move_one_task(). Always returns 0, and always clears
					* busiest_rq->active_balance before returning.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5553	*/
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5554	static int active_load_balance_cpu_stop(void *data)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5555	{
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5556	struct rq *busiest_rq = data;
				5557	int busiest_cpu = cpu_of(busiest_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5558	int target_cpu = busiest_rq->push_cpu;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5559	struct rq *target_rq = cpu_rq(target_cpu);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5560	struct sched_domain *sd;
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5561
				5562	raw_spin_lock_irq(&busiest_rq->lock);
				5563
				5564	/* make sure the requested cpu hasn't gone down in the meantime */
				5565	if (unlikely(busiest_cpu != smp_processor_id() \|\|
				5566	!busiest_rq->active_balance))
				5567	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5568
				5569	/* Is there any task to move? */
				5570	if (busiest_rq->nr_running <= 1)
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5571	goto out_unlock;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5572
				5573	/*
				5574	* This condition is "impossible", if it occurs
				5575	* we need to fix it. Originally reported by
				5576	* Bjorn Helgaas on a 128-cpu setup.
				5577	*/
				5578	BUG_ON(busiest_rq == target_rq);
				5579
				5580	/* move a task from busiest_rq to target_rq */
				5581	double_lock_balance(busiest_rq, target_rq);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5582
				5583	/* Search for an sd spanning us and the target CPU. */
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5584	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5585	for_each_domain(target_cpu, sd) {
				5586	if ((sd->flags & SD_LOAD_BALANCE) &&
				5587	cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				5588	break;
				5589	}
				5590
				5591	if (likely(sd)) {
					/* Push from the busiest rq to the target, balancing as CPU_IDLE. */
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5592	struct lb_env env = {
				5593	.sd = sd,
Peter Zijlstra	ddcdf6e	2012-02-22 19:27:40 +0100	[diff] [blame]	5594	.dst_cpu = target_cpu,
				5595	.dst_rq = target_rq,
				5596	.src_cpu = busiest_rq->cpu,
				5597	.src_rq = busiest_rq,
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5598	.idle = CPU_IDLE,
				5599	};
				5600
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5601	schedstat_inc(sd, alb_count);
				5602
Peter Zijlstra	8e45cb5	2012-02-22 12:47:19 +0100	[diff] [blame]	5603	if (move_one_task(&env))
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5604	schedstat_inc(sd, alb_pushed);
				5605	else
				5606	schedstat_inc(sd, alb_failed);
				5607	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5608	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5609	double_unlock_balance(busiest_rq, target_rq);
Tejun Heo	969c792	2010-05-06 18:49:21 +0200	[diff] [blame]	5610	out_unlock:
					/* load_balance() only queues a new active balance once this is 0. */
				5611	busiest_rq->active_balance = 0;
				5612	raw_spin_unlock_irq(&busiest_rq->lock);
				5613	return 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5614	}
				5615
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5616	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5617	/*
				5618	* idle load balancing details
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5619	* - When one of the busy CPUs notices that idle rebalancing may be
				5620	* needed, it kicks the idle load balancer, which then does idle
				5621	* load balancing for all the idle CPUs.
					* - idle_cpus_mask and nr_cpus are updated together by
					* nohz_balance_enter_idle() and nohz_balance_exit_idle(): the mask
					* holds the CPUs flagged NOHZ_TICK_STOPPED, nr_cpus counts them.
					* - next_balance is compared against jiffies in nohz_kick_needed().
				5622	*/
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5623	static struct {
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5624	cpumask_var_t idle_cpus_mask;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5625	atomic_t nr_cpus;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5626	unsigned long next_balance; /* in jiffy units */
				5627	} nohz ____cacheline_aligned;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5628
					/*
					* Pick the CPU that should run the nohz idle load balance: the first
					* CPU in nohz.idle_cpus_mask, provided it is currently idle.
					* Returns nr_cpu_ids when the mask is empty or that CPU is not idle.
					* @call_cpu is not used.
					*/
Peter Zijlstra	8e7fbcb	2012-01-09 11:28:35 +0100	[diff] [blame]	5629	static inline int find_new_ilb(int call_cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5630	{
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5631	int ilb = cpumask_first(nohz.idle_cpus_mask);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5632
Suresh Siddha	786d6dc7	2011-12-01 17:07:35 -0800	[diff] [blame]	5633	if (ilb < nr_cpu_ids && idle_cpu(ilb))
				5634	return ilb;
				5635
				5636	return nr_cpu_ids;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5637	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5638
				5639	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5640	* Kick a CPU to do the nohz balancing, if it is time for it. The
				5641	* target is whatever find_new_ilb() returns (an idle CPU from
				5642	* nohz.idle_cpus_mask); nothing is kicked if there is none.
					*
					* @cpu: the calling CPU, passed through to find_new_ilb().
				5643	*/
				5644	static void nohz_balancer_kick(int cpu)
				5645	{
				5646	int ilb_cpu;
				5647
					/*
					* Advance the next nohz balance time by one jiffy.
					* NOTE(review): presumably this stops further kicks until the
					* kicked CPU updates nohz.next_balance itself -- confirm.
					*/
				5648	nohz.next_balance++;
				5649
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5650	ilb_cpu = find_new_ilb(cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5651
					/* No suitable idle CPU. */
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5652	if (ilb_cpu >= nr_cpu_ids)
				5653	return;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5654
					/* A kick is already pending on that CPU. */
Suresh Siddha	cd490c5	2011-12-06 11:26:34 -0800	[diff] [blame]	5655	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5656	return;
				5657	/*
				5658	* Use smp_send_reschedule() instead of resched_cpu().
				5659	* This way we generate a sched IPI on the target cpu which
				5660	* is idle. And the softirq performing nohz idle load balance
				5661	* will be run before returning from the IPI.
				5662	*/
				5663	smp_send_reschedule(ilb_cpu);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5664	return;
				5665	}
				5666
					/*
					* Undo nohz_balance_enter_idle() for @cpu: if its NOHZ_TICK_STOPPED
					* flag is set, remove it from nohz.idle_cpus_mask, decrement
					* nohz.nr_cpus and clear the flag. Does nothing otherwise.
					*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5667	static inline void nohz_balance_exit_idle(int cpu)
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5668	{
				5669	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
				5670	cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
				5671	atomic_dec(&nohz.nr_cpus);
				5672	clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
				5673	}
				5674	}
				5675
					/*
					* Mark the current CPU as busy in its sched domain hierarchy.
					*
					* If the domain attached to this rq is flagged nohz_idle, clear the
					* flag and increment nr_busy_cpus in sd->groups->sgp for that domain
					* and every parent. Does nothing when there is no domain or the flag
					* is already clear. Runs under rcu_read_lock().
					*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5676	static inline void set_cpu_sd_state_busy(void)
				5677	{
				5678	struct sched_domain *sd;
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5679
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5680	rcu_read_lock();
Nathan Zimmer	424c93f	2013-05-09 11:24:03 -0500	[diff] [blame]	5681	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5682
				5683	if (!sd \|\| !sd->nohz_idle)
				5684	goto unlock;
				5685	sd->nohz_idle = 0;
				5686
				5687	for (; sd; sd = sd->parent)
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5688	atomic_inc(&sd->groups->sgp->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5689	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5690	rcu_read_unlock();
				5691	}
				5692
					/*
					* Mark the current CPU as idle in its sched domain hierarchy.
					*
					* If the domain attached to this rq is not yet flagged nohz_idle, set
					* the flag and decrement nr_busy_cpus in sd->groups->sgp for that
					* domain and every parent. Does nothing when there is no domain or
					* the flag is already set. Runs under rcu_read_lock().
					*/
				5693	void set_cpu_sd_state_idle(void)
				5694	{
				5695	struct sched_domain *sd;
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5696
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5697	rcu_read_lock();
Nathan Zimmer	424c93f	2013-05-09 11:24:03 -0500	[diff] [blame]	5698	sd = rcu_dereference_check_sched_domain(this_rq()->sd);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5699
				5700	if (!sd \|\| sd->nohz_idle)
				5701	goto unlock;
				5702	sd->nohz_idle = 1;
				5703
				5704	for (; sd; sd = sd->parent)
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5705	atomic_dec(&sd->groups->sgp->nr_busy_cpus);
Vincent Guittot	25f55d9	2013-04-23 16:59:02 +0200	[diff] [blame]	5706	unlock:
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5707	rcu_read_unlock();
				5708	}
				5709
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5710	/*
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5711	* This routine will record that the cpu is going idle with tick stopped.
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5712	* This info will be used in performing idle load balancing in the future.
					*
					* Adds @cpu to nohz.idle_cpus_mask, increments nohz.nr_cpus and sets
					* the CPU's NOHZ_TICK_STOPPED flag, unless the CPU is not active or is
					* already flagged.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5713	*/
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5714	void nohz_balance_enter_idle(int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5715	{
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5716	/*
				5717	* If this cpu is going down, then nothing needs to be done.
				5718	*/
				5719	if (!cpu_active(cpu))
				5720	return;
				5721
					/* Already recorded: keep the mask and counter unchanged. */
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5722	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
				5723	return;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5724
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5725	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
				5726	atomic_inc(&nohz.nr_cpus);
				5727	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5728	}
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5729
					/*
					* Notifier callback: on CPU_DYING (with or without the
					* CPU_TASKS_FROZEN bit) remove the current CPU from the nohz idle
					* bookkeeping and return NOTIFY_OK; every other action returns
					* NOTIFY_DONE. @nfb and @hcpu are not used.
					*/
Paul Gortmaker	0db0628	2013-06-19 14:53:51 -0400	[diff] [blame]	5730	static int sched_ilb_notifier(struct notifier_block *nfb,
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5731	unsigned long action, void *hcpu)
				5732	{
				5733	switch (action & ~CPU_TASKS_FROZEN) {
				5734	case CPU_DYING:
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5735	nohz_balance_exit_idle(smp_processor_id());
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	5736	return NOTIFY_OK;
				5737	default:
				5738	return NOTIFY_DONE;
				5739	}
				5740	}
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5741	#endif
				5742
					/*
					* Taken with spin_trylock() by rebalance_domains() around the
					* balancing of domains that have SD_SERIALIZE set.
					*/
				5743	static DEFINE_SPINLOCK(balancing);
				5744
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5745	/*
				5746	* Scale the max load_balance interval with the number of CPUs in the system.
				5747	* This trades load-balance latency on larger machines for less cross talk.
					*
					* The result is HZ/10 jiffies (a tenth of a second) per online CPU;
					* rebalance_domains() uses it as the upper bound when clamping a
					* domain's balance interval.
				5748	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5749	void update_max_interval(void)
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5750	{
				5751	max_load_balance_interval = HZ*num_online_cpus()/10;
				5752	}
				5753
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5754	/*
				5755	* It checks each scheduling domain to see if it is due to be balanced,
				5756	* and initiates a balancing operation if so.
				5757	*
Libin	b9b0853	2013-04-01 19:14:01 +0800	[diff] [blame]	5758	* Balancing parameters are set up in init_sched_domains.
					*
					* @cpu: CPU whose domains are walked.
					* @idle: idle state to balance with; re-evaluated from idle_cpu(@cpu)
					* after any load_balance() call that moved tasks.
					*
					* Also decays each domain's max_newidle_lb_cost (at most once per
					* second per domain) and, when a decay happened, refreshes
					* rq->max_idle_balance_cost. rq->next_balance is set to the earliest
					* deadline found among the balanced domains.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5759	*/
				5760	static void rebalance_domains(int cpu, enum cpu_idle_type idle)
				5761	{
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5762	int continue_balancing = 1;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5763	struct rq *rq = cpu_rq(cpu);
				5764	unsigned long interval;
Peter Zijlstra	04f733b	2012-05-11 00:12:02 +0200	[diff] [blame]	5765	struct sched_domain *sd;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5766	/* Earliest time when we have to do rebalance again */
				5767	unsigned long next_balance = jiffies + 60*HZ;
				5768	int update_next_balance = 0;
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5769	int need_serialize, need_decay = 0;
					/* Sum of max_newidle_lb_cost over the domains visited. */
				5770	u64 max_cost = 0;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5771
Paul Turner	48a1675	2012-10-04 13:18:31 +0200	[diff] [blame]	5772	update_blocked_averages(cpu);
Peter Zijlstra	2069dd7	2010-11-15 15:47:00 -0800	[diff] [blame]	5773
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5774	rcu_read_lock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5775	for_each_domain(cpu, sd) {
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5776	/*
				5777	* Decay the newidle max times here because this is a regular
				5778	* visit to all the domains. Decay ~1% per second.
				5779	*/
				5780	if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
				5781	sd->max_newidle_lb_cost =
				5782	(sd->max_newidle_lb_cost * 253) / 256;
				5783	sd->next_decay_max_lb_cost = jiffies + HZ;
				5784	need_decay = 1;
				5785	}
				5786	max_cost += sd->max_newidle_lb_cost;
				5787
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5788	if (!(sd->flags & SD_LOAD_BALANCE))
				5789	continue;
				5790
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5791	/*
				5792	* Stop the load balance at this level. There is another
				5793	* CPU in our sched group which is doing load balancing more
				5794	* actively.
					*
					* If a decay is in progress, keep iterating so the remaining
					* domains are still decayed and summed into max_cost.
				5795	*/
				5796	if (!continue_balancing) {
				5797	if (need_decay)
				5798	continue;
				5799	break;
				5800	}
				5801
					/* Balance less often when this CPU is not idle. */
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5802	interval = sd->balance_interval;
				5803	if (idle != CPU_IDLE)
				5804	interval *= sd->busy_factor;
				5805
				5806	/* scale ms to jiffies */
				5807	interval = msecs_to_jiffies(interval);
Peter Zijlstra	49c022e	2011-04-05 10:14:25 +0200	[diff] [blame]	5808	interval = clamp(interval, 1UL, max_load_balance_interval);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5809
				5810	need_serialize = sd->flags & SD_SERIALIZE;
				5811
					/*
					* Serialized domain and the lock is busy: skip balancing it,
					* but still fold its deadline into next_balance at "out".
					*/
				5812	if (need_serialize) {
				5813	if (!spin_trylock(&balancing))
				5814	goto out;
				5815	}
				5816
				5817	if (time_after_eq(jiffies, sd->last_balance + interval)) {
Joonsoo Kim	23f0d20	2013-08-06 17:36:42 +0900	[diff] [blame]	5818	if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5819	/*
Peter Zijlstra	6263322	2013-08-19 12:41:09 +0200	[diff] [blame]	5820	* The LBF_DST_PINNED logic could have changed
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	5821	* env->dst_cpu, so we can't know our idle
				5822	* state even if we migrated tasks. Update it.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5823	*/
Joonsoo Kim	de5eb2d	2013-04-23 17:27:38 +0900	[diff] [blame]	5824	idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5825	}
				5826	sd->last_balance = jiffies;
				5827	}
				5828	if (need_serialize)
				5829	spin_unlock(&balancing);
				5830	out:
				5831	if (time_after(next_balance, sd->last_balance + interval)) {
				5832	next_balance = sd->last_balance + interval;
				5833	update_next_balance = 1;
				5834	}
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5835	}
				5836	if (need_decay) {
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5837	/*
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5838	* Ensure the rq-wide value also decays but keep it at a
				5839	* reasonable floor to avoid funnies with rq->avg_idle.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5840	*/
Jason Low	f48627e	2013-09-13 11:26:53 -0700	[diff] [blame]	5841	rq->max_idle_balance_cost =
				5842	max((u64)sysctl_sched_migration_cost, max_cost);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5843	}
Peter Zijlstra	dce840a	2011-04-07 14:09:50 +0200	[diff] [blame]	5844	rcu_read_unlock();
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5845
				5846	/*
				5847	* next_balance will be updated only when there is a need.
				5848	* When the cpu is attached to a null domain, for example, it will
				5849	* not be updated.
				5850	*/
				5851	if (likely(update_next_balance))
				5852	rq->next_balance = next_balance;
				5853	}
				5854
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5855	#ifdef CONFIG_NO_HZ_COMMON
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5856	/*
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	5857	* In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5858	* rebalancing for all the cpus for whom scheduler ticks are stopped.
					*
					* @this_cpu: the CPU running the balance on behalf of the others.
					* @idle: idle state of @this_cpu; nothing is balanced unless it is
					* CPU_IDLE and NOHZ_BALANCE_KICK is set for @this_cpu.
					*
					* NOHZ_BALANCE_KICK is cleared for @this_cpu on every path.
				5859	*/
				5860	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
				5861	{
				5862	struct rq *this_rq = cpu_rq(this_cpu);
				5863	struct rq *rq;
				5864	int balance_cpu;
				5865
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5866	if (idle != CPU_IDLE \|\|
				5867	!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
				5868	goto end;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5869
				5870	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
					/* Skip ourselves and any CPU that is no longer idle. */
Suresh Siddha	8a6d42d	2011-12-06 11:19:37 -0800	[diff] [blame]	5871	if (balance_cpu == this_cpu \|\| !idle_cpu(balance_cpu))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5872	continue;
				5873
				5874	/*
				5875	* If this cpu gets work to do, stop the load balancing
				5876	* work being done for other cpus. Next load
				5877	* balancing owner will pick it up.
				5878	*/
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5879	if (need_resched())
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5880	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5881
Vincent Guittot	5ed4f1d	2012-09-13 06:11:26 +0200	[diff] [blame]	5882	rq = cpu_rq(balance_cpu);
				5883
					/* Refresh the remote rq's clock and idle load under its lock. */
				5884	raw_spin_lock_irq(&rq->lock);
				5885	update_rq_clock(rq);
				5886	update_idle_cpu_load(rq);
				5887	raw_spin_unlock_irq(&rq->lock);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5888
				5889	rebalance_domains(balance_cpu, CPU_IDLE);
				5890
					/* Track the earliest next_balance among the balanced CPUs. */
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5891	if (time_after(this_rq->next_balance, rq->next_balance))
				5892	this_rq->next_balance = rq->next_balance;
				5893	}
				5894	nohz.next_balance = this_rq->next_balance;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5895	end:
				5896	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5897	}
				5898
				5899	/*
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5900	* Current heuristic for kicking the idle load balancer in the presence
				5901	* of an idle cpu in the system.
				5902	* - This rq has more than one task.
				5903	* - At any scheduler domain level, this cpu's scheduler group has multiple
				5904	* busy cpu's exceeding the group's power.
				5905	* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
				5906	* domain span are idle.
					*
					* Returns 1 if the idle load balancer should be kicked, 0 otherwise.
					* Side effect: unless @cpu is idle, the CPU is marked busy in its
					* sched domains and removed from the nohz idle bookkeeping.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5907	*/
				5908	static inline int nohz_kick_needed(struct rq *rq, int cpu)
				5909	{
				5910	unsigned long now = jiffies;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5911	struct sched_domain *sd;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5912
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5913	if (unlikely(idle_cpu(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5914	return 0;
				5915
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5916	/*
				5917	* We may be recently in ticked or tickless idle mode. At the first
				5918	* busy tick after returning from idle, we will update the busy stats.
				5919	*/
Suresh Siddha	69e1e81	2011-12-01 17:07:33 -0800	[diff] [blame]	5920	set_cpu_sd_state_busy();
Alex Shi	c1cc017	2012-09-10 15:10:58 +0800	[diff] [blame]	5921	nohz_balance_exit_idle(cpu);
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5922
				5923	/*
				5924	* None are in tickless mode and hence no need for NOHZ idle load
				5925	* balancing.
				5926	*/
				5927	if (likely(!atomic_read(&nohz.nr_cpus)))
				5928	return 0;
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	5929
					/* Not yet time for the next nohz balance. */
				5930	if (time_before(now, nohz.next_balance))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5931	return 0;
				5932
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5933	if (rq->nr_running >= 2)
				5934	goto need_kick;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5935
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5936	rcu_read_lock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5937	for_each_domain(cpu, sd) {
				5938	struct sched_group *sg = sd->groups;
				5939	struct sched_group_power *sgp = sg->sgp;
				5940	int nr_busy = atomic_read(&sgp->nr_busy_cpus);
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5941
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5942	if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5943	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5944
				5945	if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
				5946	&& (cpumask_first_and(nohz.idle_cpus_mask,
				5947	sched_domain_span(sd)) < cpu))
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5948	goto need_kick_unlock;
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5949
					/* Stop at the first domain with neither of the two flags. */
				5950	if (!(sd->flags & (SD_SHARE_PKG_RESOURCES \| SD_ASYM_PACKING)))
				5951	break;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5952	}
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5953	rcu_read_unlock();
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5954	return 0;
Peter Zijlstra	067491b	2011-12-07 14:32:08 +0100	[diff] [blame]	5955
				5956	need_kick_unlock:
				5957	rcu_read_unlock();
Suresh Siddha	0b005cf	2011-12-01 17:07:34 -0800	[diff] [blame]	5958	need_kick:
				5959	return 1;
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5960	}
				5961	#else
					/* Without CONFIG_NO_HZ_COMMON there is no nohz idle balancing to do. */
				5962	static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
				5963	#endif
				5964
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5965	/*
				5966	* run_rebalance_domains is triggered when needed from the scheduler tick.
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5967	* Also triggered for nohz idle balancing (with NOHZ_BALANCE_KICK set).
					*
					* Balances the current CPU's own domains first, then calls
					* nohz_idle_balance() for the other tick-stopped idle CPUs.
					* @h is not used.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5968	*/
				5969	static void run_rebalance_domains(struct softirq_action *h)
				5970	{
				5971	int this_cpu = smp_processor_id();
				5972	struct rq *this_rq = cpu_rq(this_cpu);
					/* The idle state is taken from the rq's idle_balance field. */
Suresh Siddha	6eb57e0	2011-10-03 15:09:01 -0700	[diff] [blame]	5973	enum cpu_idle_type idle = this_rq->idle_balance ?
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5974	CPU_IDLE : CPU_NOT_IDLE;
				5975
				5976	rebalance_domains(this_cpu, idle);
				5977
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5978	/*
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5979	* If this cpu has a pending nohz_balance_kick, then do the
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5980	* balancing on behalf of the other idle cpus whose ticks are
				5981	* stopped.
				5982	*/
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	5983	nohz_idle_balance(this_cpu, idle);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5984	}
				5985
				5986	static inline int on_null_domain(int cpu)
				5987	{
Paul E. McKenney	90a6501	2010-02-28 08:32:18 -0800	[diff] [blame]	5988	return !rcu_dereference_sched(cpu_rq(cpu)->sd);
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5989	}
				5990
				5991	/*
				5992	* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5993	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	5994	void trigger_load_balance(struct rq *rq, int cpu)
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5995	{
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	5996	/* Don't need to rebalance while attached to NULL domain */
				5997	if (time_after_eq(jiffies, rq->next_balance) &&
				5998	likely(!on_null_domain(cpu)))
				5999	raise_softirq(SCHED_SOFTIRQ);
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	6000	#ifdef CONFIG_NO_HZ_COMMON
Suresh Siddha	1c792db	2011-12-01 17:07:32 -0800	[diff] [blame]	6001	if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
Venkatesh Pallipadi	83cd4fe	2010-05-21 17:09:41 -0700	[diff] [blame]	6002	nohz_balancer_kick(cpu);
				6003	#endif
Peter Zijlstra	1e3c88b	2009-12-17 17:00:43 +0100	[diff] [blame]	6004	}
				6005
/* ->rq_online hook of the fair class: only calls update_sysctl(). */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}
				6010
				6011	static void rq_offline_fair(struct rq *rq)
				6012	{
				6013	update_sysctl();
Peter Boonstoppel	a4c96ae	2012-08-09 15:34:47 -0700	[diff] [blame]	6014
				6015	/* Ensure any throttled groups are reachable by pick_next_task */
				6016	unthrottle_offline_cfs_rqs(rq);
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	6017	}
				6018
Dhaval Giani	55e12e5	2008-06-24 23:39:43 +0530	[diff] [blame]	6019	#endif /* CONFIG_SMP */
Peter Williams	e1d1484	2007-10-24 18:23:51 +0200	[diff] [blame]	6020
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6021	/*
				6022	* scheduler tick hitting a task of our scheduling class:
				6023	*/
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6024	static void task_tick_fair(struct rq rq, struct task_struct curr, int queued)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6025	{
				6026	struct cfs_rq *cfs_rq;
				6027	struct sched_entity *se = &curr->se;
				6028
				6029	for_each_sched_entity(se) {
				6030	cfs_rq = cfs_rq_of(se);
Peter Zijlstra	8f4d37e	2008-01-25 21:08:29 +0100	[diff] [blame]	6031	entity_tick(cfs_rq, se, queued);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6032	}
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	6033
Dave Kleikamp	10e84b9	2013-07-31 13:53:35 -0700	[diff] [blame]	6034	if (numabalancing_enabled)
Peter Zijlstra	cbee9f8	2012-10-25 14:16:43 +0200	[diff] [blame]	6035	task_tick_numa(rq, curr);
Linus Torvalds	3d59eeb	2012-12-16 14:33:25 -0800	[diff] [blame]	6036
Ben Segall	18bf280	2012-10-04 12:51:20 +0200	[diff] [blame]	6037	update_rq_runnable_avg(rq, 1);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6038	}
				6039
				6040	/*
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6041	* called on fork with the child task as argument from the parent's context
				6042	* - child not yet on the tasklist
				6043	* - preemption disabled
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6044	*/
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6045	static void task_fork_fair(struct task_struct *p)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6046	{
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	6047	struct cfs_rq *cfs_rq;
				6048	struct sched_entity se = &p->se, curr;
Ingo Molnar	00bf7bf	2007-10-15 17:00:14 +0200	[diff] [blame]	6049	int this_cpu = smp_processor_id();
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6050	struct rq *rq = this_rq();
				6051	unsigned long flags;
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6052
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	6053	raw_spin_lock_irqsave(&rq->lock, flags);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6054
Peter Zijlstra	861d034	2010-08-19 13:31:43 +0200	[diff] [blame]	6055	update_rq_clock(rq);
				6056
Daisuke Nishimura	4fc420c	2011-12-15 14:36:55 +0900	[diff] [blame]	6057	cfs_rq = task_cfs_rq(current);
				6058	curr = cfs_rq->curr;
				6059
Daisuke Nishimura	6c9a27f	2013-09-10 18:16:36 +0900	[diff] [blame]	6060	/*
				6061	* Not only the cpu but also the task_group of the parent might have
				6062	* been changed after parent->se.parent,cfs_rq were copied to
				6063	* child->se.parent,cfs_rq. So call __set_task_cpu() to make those
				6064	* of child point to valid ones.
				6065	*/
				6066	rcu_read_lock();
				6067	__set_task_cpu(p, this_cpu);
				6068	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6069
Ting Yang	7109c44	2007-08-28 12:53:24 +0200	[diff] [blame]	6070	update_curr(cfs_rq);
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6071
Mike Galbraith	b5d9d73	2009-09-08 11:12:28 +0200	[diff] [blame]	6072	if (curr)
				6073	se->vruntime = curr->vruntime;
Peter Zijlstra	aeb73b0	2007-10-15 17:00:05 +0200	[diff] [blame]	6074	place_entity(cfs_rq, se, 1);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6075
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6076	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
Dmitry Adamushko	87fefa3	2007-10-15 17:00:08 +0200	[diff] [blame]	6077	/*
Ingo Molnar	edcb60a	2007-10-15 17:00:08 +0200	[diff] [blame]	6078	* Upon rescheduling, sched_class::put_prev_task() will place
				6079	* 'current' within the tree based on its new key value.
				6080	*/
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6081	swap(curr->vruntime, se->vruntime);
Bharata B Rao	aec0a51	2008-08-28 14:42:49 +0530	[diff] [blame]	6082	resched_task(rq->curr);
Peter Zijlstra	4d78e7b	2007-10-15 17:00:04 +0200	[diff] [blame]	6083	}
				6084
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6085	se->vruntime -= cfs_rq->min_vruntime;
				6086
Thomas Gleixner	05fa785	2009-11-17 14:28:38 +0100	[diff] [blame]	6087	raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6088	}
				6089
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6090	/*
				6091	* Priority of the task has changed. Check to see if we preempt
				6092	* the current task.
				6093	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6094	static void
				6095	prio_changed_fair(struct rq rq, struct task_struct p, int oldprio)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6096	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6097	if (!p->se.on_rq)
				6098	return;
				6099
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6100	/*
				6101	* Reschedule if we are currently running on this runqueue and
				6102	* our priority decreased, or if we are not currently running on
				6103	* this runqueue and our priority is higher than the current's
				6104	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6105	if (rq->curr == p) {
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6106	if (p->prio > oldprio)
				6107	resched_task(rq->curr);
				6108	} else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	6109	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6110	}
				6111
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6112	static void switched_from_fair(struct rq rq, struct task_struct p)
				6113	{
				6114	struct sched_entity *se = &p->se;
				6115	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				6116
				6117	/*
				6118	* Ensure the task's vruntime is normalized, so that when its
				6119	* switched back to the fair class the enqueue_entity(.flags=0) will
				6120	* do the right thing.
				6121	*
				6122	* If it was on_rq, then the dequeue_entity(.flags=0) will already
				6123	* have normalized the vruntime, if it was !on_rq, then only when
				6124	* the task is sleeping will it still have non-normalized vruntime.
				6125	*/
				6126	if (!se->on_rq && p->state != TASK_RUNNING) {
				6127	/*
				6128	* Fix up our vruntime so that the current sleep doesn't
				6129	* cause 'unlimited' sleep bonus.
				6130	*/
				6131	place_entity(cfs_rq, se, 0);
				6132	se->vruntime -= cfs_rq->min_vruntime;
				6133	}
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6134
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	6135	#ifdef CONFIG_SMP
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6136	/*
				6137	* Remove our load from contribution when we leave sched_fair
				6138	* and ensure we don't carry in an old decay_count if we
				6139	* switch back.
				6140	*/
Kirill Tkhai	87e3c8a	2013-07-21 04:32:07 +0400	[diff] [blame]	6141	if (se->avg.decay_count) {
				6142	__synchronize_entity_decay(se);
				6143	subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6144	}
				6145	#endif
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6146	}
				6147
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6148	/*
				6149	* We switched to the sched_fair class.
				6150	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6151	static void switched_to_fair(struct rq rq, struct task_struct p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6152	{
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6153	if (!p->se.on_rq)
				6154	return;
				6155
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6156	/*
				6157	* We were most likely switched from sched_rt, so
				6158	* kick off the schedule if running, otherwise just see
				6159	* if we can still preempt the current task.
				6160	*/
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6161	if (rq->curr == p)
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6162	resched_task(rq->curr);
				6163	else
Peter Zijlstra	15afe09	2008-09-20 23:38:02 +0200	[diff] [blame]	6164	check_preempt_curr(rq, p, 0);
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6165	}
				6166
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6167	/* Account for a task changing its policy or group.
				6168	*
				6169	* This routine is mostly called to set cfs_rq->curr field when a task
				6170	* migrates between groups/classes.
				6171	*/
				6172	static void set_curr_task_fair(struct rq *rq)
				6173	{
				6174	struct sched_entity *se = &rq->curr->se;
				6175
Paul Turner	ec12cb7	2011-07-21 09:43:30 -0700	[diff] [blame]	6176	for_each_sched_entity(se) {
				6177	struct cfs_rq *cfs_rq = cfs_rq_of(se);
				6178
				6179	set_next_entity(cfs_rq, se);
				6180	/* ensure bandwidth has been allocated on our new cfs_rq */
				6181	account_cfs_rq_runtime(cfs_rq, 0);
				6182	}
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6183	}
				6184
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6185	void init_cfs_rq(struct cfs_rq *cfs_rq)
				6186	{
				6187	cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6188	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
				6189	#ifndef CONFIG_64BIT
				6190	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
				6191	#endif
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	6192	#ifdef CONFIG_SMP
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6193	atomic64_set(&cfs_rq->decay_counter, 1);
Alex Shi	2509940	2013-06-20 10:18:55 +0800	[diff] [blame]	6194	atomic_long_set(&cfs_rq->removed_load, 0);
Paul Turner	9ee474f	2012-10-04 13:18:30 +0200	[diff] [blame]	6195	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6196	}
				6197
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6198	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6199	static void task_move_group_fair(struct task_struct *p, int on_rq)
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6200	{
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	6201	struct cfs_rq *cfs_rq;
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6202	/*
				6203	* If the task was not on the rq at the time of this cgroup movement
				6204	* it must have been asleep, sleeping tasks keep their ->vruntime
				6205	* absolute on their old rq until wakeup (needed for the fair sleeper
				6206	* bonus in place_entity()).
				6207	*
				6208	* If it was on the rq, we've just 'preempted' it, which does convert
				6209	* ->vruntime to a relative base.
				6210	*
				6211	* Make sure both cases convert their relative position when migrating
				6212	* to another cgroup's rq. This does somewhat interfere with the
				6213	* fair sleeper stuff for the first placement, but who cares.
				6214	*/
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6215	/*
				6216	* When !on_rq, vruntime of the task has usually NOT been normalized.
				6217	* But there are some cases where it has already been normalized:
				6218	*
				6219	* - Moving a forked child which is waiting for being woken up by
				6220	* wake_up_new_task().
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	6221	* - Moving a task which has been woken up by try_to_wake_up() and
				6222	* waiting for actually being woken up by sched_ttwu_pending().
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6223	*
				6224	* To prevent boost or penalty in the new cfs_rq caused by delta
				6225	* min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
				6226	*/
Daisuke Nishimura	62af378	2011-12-15 14:37:41 +0900	[diff] [blame]	6227	if (!on_rq && (!p->se.sum_exec_runtime \|\| p->state == TASK_WAKING))
Daisuke Nishimura	7ceff01	2011-12-15 14:36:07 +0900	[diff] [blame]	6228	on_rq = 1;
				6229
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6230	if (!on_rq)
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6231	p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
				6232	set_task_rq(p, task_cpu(p));
Paul Turner	aff3e49	2012-10-04 13:18:30 +0200	[diff] [blame]	6233	if (!on_rq) {
				6234	cfs_rq = cfs_rq_of(&p->se);
				6235	p->se.vruntime += cfs_rq->min_vruntime;
				6236	#ifdef CONFIG_SMP
				6237	/*
				6238	* migrate_task_rq_fair() will have removed our previous
				6239	* contribution, but we must synchronize for ongoing future
				6240	* decay.
				6241	*/
				6242	p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
				6243	cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
				6244	#endif
				6245	}
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6246	}
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6247
				6248	void free_fair_sched_group(struct task_group *tg)
				6249	{
				6250	int i;
				6251
				6252	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
				6253
				6254	for_each_possible_cpu(i) {
				6255	if (tg->cfs_rq)
				6256	kfree(tg->cfs_rq[i]);
				6257	if (tg->se)
				6258	kfree(tg->se[i]);
				6259	}
				6260
				6261	kfree(tg->cfs_rq);
				6262	kfree(tg->se);
				6263	}
				6264
				6265	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				6266	{
				6267	struct cfs_rq *cfs_rq;
				6268	struct sched_entity *se;
				6269	int i;
				6270
				6271	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
				6272	if (!tg->cfs_rq)
				6273	goto err;
				6274	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
				6275	if (!tg->se)
				6276	goto err;
				6277
				6278	tg->shares = NICE_0_LOAD;
				6279
				6280	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
				6281
				6282	for_each_possible_cpu(i) {
				6283	cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
				6284	GFP_KERNEL, cpu_to_node(i));
				6285	if (!cfs_rq)
				6286	goto err;
				6287
				6288	se = kzalloc_node(sizeof(struct sched_entity),
				6289	GFP_KERNEL, cpu_to_node(i));
				6290	if (!se)
				6291	goto err_free_rq;
				6292
				6293	init_cfs_rq(cfs_rq);
				6294	init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
				6295	}
				6296
				6297	return 1;
				6298
				6299	err_free_rq:
				6300	kfree(cfs_rq);
				6301	err:
				6302	return 0;
				6303	}
				6304
				6305	void unregister_fair_sched_group(struct task_group *tg, int cpu)
				6306	{
				6307	struct rq *rq = cpu_rq(cpu);
				6308	unsigned long flags;
				6309
				6310	/*
				6311	* Only empty task groups can be destroyed; so we can speculatively
				6312	* check on_list without danger of it being re-added.
				6313	*/
				6314	if (!tg->cfs_rq[cpu]->on_list)
				6315	return;
				6316
				6317	raw_spin_lock_irqsave(&rq->lock, flags);
				6318	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
				6319	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6320	}
				6321
				6322	void init_tg_cfs_entry(struct task_group tg, struct cfs_rq cfs_rq,
				6323	struct sched_entity *se, int cpu,
				6324	struct sched_entity *parent)
				6325	{
				6326	struct rq *rq = cpu_rq(cpu);
				6327
				6328	cfs_rq->tg = tg;
				6329	cfs_rq->rq = rq;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6330	init_cfs_rq_runtime(cfs_rq);
				6331
				6332	tg->cfs_rq[cpu] = cfs_rq;
				6333	tg->se[cpu] = se;
				6334
				6335	/* se could be NULL for root_task_group */
				6336	if (!se)
				6337	return;
				6338
				6339	if (!parent)
				6340	se->cfs_rq = &rq->cfs;
				6341	else
				6342	se->cfs_rq = parent->my_q;
				6343
				6344	se->my_q = cfs_rq;
				6345	update_load_set(&se->load, 0);
				6346	se->parent = parent;
				6347	}
				6348
				6349	static DEFINE_MUTEX(shares_mutex);
				6350
				6351	int sched_group_set_shares(struct task_group *tg, unsigned long shares)
				6352	{
				6353	int i;
				6354	unsigned long flags;
				6355
				6356	/*
				6357	* We can't change the weight of the root cgroup.
				6358	*/
				6359	if (!tg->se[0])
				6360	return -EINVAL;
				6361
				6362	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
				6363
				6364	mutex_lock(&shares_mutex);
				6365	if (tg->shares == shares)
				6366	goto done;
				6367
				6368	tg->shares = shares;
				6369	for_each_possible_cpu(i) {
				6370	struct rq *rq = cpu_rq(i);
				6371	struct sched_entity *se;
				6372
				6373	se = tg->se[i];
				6374	/* Propagate contribution to hierarchy */
				6375	raw_spin_lock_irqsave(&rq->lock, flags);
Frederic Weisbecker	71b1da4	2013-04-12 01:50:59 +0200	[diff] [blame]	6376
				6377	/* Possible calls to update_curr() need rq clock */
				6378	update_rq_clock(rq);
Linus Torvalds	17bc14b	2012-12-14 07:20:43 -0800	[diff] [blame]	6379	for_each_sched_entity(se)
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6380	update_cfs_shares(group_cfs_rq(se));
				6381	raw_spin_unlock_irqrestore(&rq->lock, flags);
				6382	}
				6383
				6384	done:
				6385	mutex_unlock(&shares_mutex);
				6386	return 0;
				6387	}
				6388	#else /* CONFIG_FAIR_GROUP_SCHED */
				6389
				6390	void free_fair_sched_group(struct task_group *tg) { }
				6391
				6392	int alloc_fair_sched_group(struct task_group tg, struct task_group parent)
				6393	{
				6394	return 1;
				6395	}
				6396
				6397	void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
				6398
				6399	#endif /* CONFIG_FAIR_GROUP_SCHED */
				6400
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6401
H Hartley Sweeten	6d686f4	2010-01-13 20:21:52 -0700	[diff] [blame]	6402	static unsigned int get_rr_interval_fair(struct rq rq, struct task_struct task)
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6403	{
				6404	struct sched_entity *se = &task->se;
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6405	unsigned int rr_interval = 0;
				6406
				6407	/*
				6408	* Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
				6409	* idle runqueue:
				6410	*/
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6411	if (rq->cfs.load.weight)
Zhu Yanhai	a59f4e0	2013-01-08 12:56:52 +0800	[diff] [blame]	6412	rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6413
				6414	return rr_interval;
				6415	}
				6416
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6417	/*
				6418	* All the scheduling class methods:
				6419	*/
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6420	const struct sched_class fair_sched_class = {
Ingo Molnar	5522d5d	2007-10-15 17:00:12 +0200	[diff] [blame]	6421	.next = &idle_sched_class,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6422	.enqueue_task = enqueue_task_fair,
				6423	.dequeue_task = dequeue_task_fair,
				6424	.yield_task = yield_task_fair,
Mike Galbraith	d95f412	2011-02-01 09:50:51 -0500	[diff] [blame]	6425	.yield_to_task = yield_to_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6426
Ingo Molnar	2e09bf5	2007-10-15 17:00:05 +0200	[diff] [blame]	6427	.check_preempt_curr = check_preempt_wakeup,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6428
				6429	.pick_next_task = pick_next_task_fair,
				6430	.put_prev_task = put_prev_task_fair,
				6431
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6432	#ifdef CONFIG_SMP
Li Zefan	4ce72a2	2008-10-22 15:25:26 +0800	[diff] [blame]	6433	.select_task_rq = select_task_rq_fair,
Paul Turner	0a74bef	2012-10-04 13:18:30 +0200	[diff] [blame]	6434	.migrate_task_rq = migrate_task_rq_fair,
Alex Shi	141965c	2013-06-26 13:05:39 +0800	[diff] [blame]	6435
Christian Ehrhardt	0bcdcf2	2009-11-30 12:16:46 +0100	[diff] [blame]	6436	.rq_online = rq_online_fair,
				6437	.rq_offline = rq_offline_fair,
Peter Zijlstra	88ec22d	2009-12-16 18:04:41 +0100	[diff] [blame]	6438
				6439	.task_waking = task_waking_fair,
Peter Williams	681f3e6	2007-10-24 18:23:51 +0200	[diff] [blame]	6440	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6441
Srivatsa Vaddagiri	83b699e	2007-10-15 17:00:08 +0200	[diff] [blame]	6442	.set_curr_task = set_curr_task_fair,
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6443	.task_tick = task_tick_fair,
Peter Zijlstra	cd29fe6	2009-11-27 17:32:46 +0100	[diff] [blame]	6444	.task_fork = task_fork_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6445
				6446	.prio_changed = prio_changed_fair,
Peter Zijlstra	da7a735	2011-01-17 17:03:27 +0100	[diff] [blame]	6447	.switched_from = switched_from_fair,
Steven Rostedt	cb46984	2008-01-25 21:08:22 +0100	[diff] [blame]	6448	.switched_to = switched_to_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6449
Peter Williams	0d721ce	2009-09-21 01:31:53 +0000	[diff] [blame]	6450	.get_rr_interval = get_rr_interval_fair,
				6451
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6452	#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra	b2b5ce0	2010-10-15 15:24:15 +0200	[diff] [blame]	6453	.task_move_group = task_move_group_fair,
Peter Zijlstra	810b381	2008-02-29 15:21:01 -0500	[diff] [blame]	6454	#endif
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6455	};
				6456
				6457	#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6458	void print_cfs_stats(struct seq_file *m, int cpu)
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6459	{
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6460	struct cfs_rq *cfs_rq;
				6461
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6462	rcu_read_lock();
Ingo Molnar	c3b64f1	2007-08-09 11:16:51 +0200	[diff] [blame]	6463	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
Ingo Molnar	5cef9ec	2007-08-09 11:16:47 +0200	[diff] [blame]	6464	print_cfs_rq(m, cpu, cfs_rq);
Peter Zijlstra	5973e5b	2008-01-25 21:08:34 +0100	[diff] [blame]	6465	rcu_read_unlock();
Ingo Molnar	bf0f6f2	2007-07-09 18:51:58 +0200	[diff] [blame]	6466	}
				6467	#endif
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6468
				6469	__init void init_sched_fair_class(void)
				6470	{
				6471	#ifdef CONFIG_SMP
				6472	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
				6473
Frederic Weisbecker	3451d02	2011-08-10 23:21:01 +0200	[diff] [blame]	6474	#ifdef CONFIG_NO_HZ_COMMON
Diwakar Tundlam	554ceca	2012-03-07 14:44:26 -0800	[diff] [blame]	6475	nohz.next_balance = jiffies;
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6476	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
Suresh Siddha	7132596	2012-01-19 18:28:57 -0800	[diff] [blame]	6477	cpu_notifier(sched_ilb_notifier, 0);
Peter Zijlstra	029632f	2011-10-25 10:00:11 +0200	[diff] [blame]	6478	#endif
				6479	#endif /* SMP */
				6480
				6481	}