blob: d6b149ccf925c320841e8a42f31fd23b6ee64dc6 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * kernel/sched.c
3 *
4 * Kernel scheduler and related syscalls
5 *
6 * Copyright (C) 1991-2002 Linus Torvalds
7 *
8 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
9 * make semaphores SMP safe
10 * 1998-11-19 Implemented schedule_timeout() and related stuff
11 * by Andrea Arcangeli
12 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
13 * hybrid priority-list and round-robin design with
14 * an array-switch method of distributing timeslices
15 * and per-CPU runqueues. Cleanups and useful suggestions
16 * by Davide Libenzi, preemptible kernel bits by Robert Love.
17 * 2003-09-03 Interactivity tuning by Con Kolivas.
18 * 2004-04-02 Scheduler domains code by Nick Piggin
Ingo Molnarc31f2e82007-07-09 18:52:01 +020019 * 2007-04-15 Work begun on replacing all interactivity tuning with a
20 * fair scheduling design by Con Kolivas.
21 * 2007-05-05 Load balancing (smp-nice) and other improvements
22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
Ingo Molnarb9131762008-01-25 21:08:19 +010025 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
Linus Torvalds1da177e2005-04-16 15:20:36 -070027 */
28
29#include <linux/mm.h>
30#include <linux/module.h>
31#include <linux/nmi.h>
32#include <linux/init.h>
Ingo Molnardff06c12007-07-09 18:52:00 +020033#include <linux/uaccess.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/highmem.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070035#include <asm/mmu_context.h>
36#include <linux/interrupt.h>
Randy.Dunlapc59ede72006-01-11 12:17:46 -080037#include <linux/capability.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070038#include <linux/completion.h>
39#include <linux/kernel_stat.h>
Ingo Molnar9a11b49a2006-07-03 00:24:33 -070040#include <linux/debug_locks.h>
Ingo Molnarcdd6c482009-09-21 12:02:48 +020041#include <linux/perf_event.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070042#include <linux/security.h>
43#include <linux/notifier.h>
44#include <linux/profile.h>
Nigel Cunningham7dfb7102006-12-06 20:34:23 -080045#include <linux/freezer.h>
akpm@osdl.org198e2f12006-01-12 01:05:30 -080046#include <linux/vmalloc.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070047#include <linux/blkdev.h>
48#include <linux/delay.h>
Pavel Emelyanovb4888932007-10-18 23:40:14 -070049#include <linux/pid_namespace.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070050#include <linux/smp.h>
51#include <linux/threads.h>
52#include <linux/timer.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h>
55#include <linux/cpuset.h>
56#include <linux/percpu.h>
Alexey Dobriyanb5aadf72008-10-06 13:23:43 +040057#include <linux/proc_fs.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070058#include <linux/seq_file.h>
Tejun Heo969c7922010-05-06 18:49:21 +020059#include <linux/stop_machine.h>
Nick Piggine692ab52007-07-26 13:40:43 +020060#include <linux/sysctl.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070061#include <linux/syscalls.h>
62#include <linux/times.h>
Jay Lan8f0ab512006-09-30 23:28:59 -070063#include <linux/tsacct_kern.h>
bibo maoc6fd91f2006-03-26 01:38:20 -080064#include <linux/kprobes.h>
Shailabh Nagar0ff92242006-07-14 00:24:37 -070065#include <linux/delayacct.h>
Ingo Molnardff06c12007-07-09 18:52:00 +020066#include <linux/unistd.h>
Jens Axboef5ff8422007-09-21 09:19:54 +020067#include <linux/pagemap.h>
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +010068#include <linux/hrtimer.h>
Reynes Philippe30914a52008-03-17 16:19:05 -070069#include <linux/tick.h>
Peter Zijlstraf00b45c2008-04-19 19:45:00 +020070#include <linux/debugfs.h>
71#include <linux/ctype.h>
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +020072#include <linux/ftrace.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +090073#include <linux/slab.h>
Carsten Emdef1c6f1a2011-10-26 23:14:16 +020074#include <linux/init_task.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070075
Eric Dumazet5517d862007-05-08 00:32:57 -070076#include <asm/tlb.h>
Satyam Sharma838225b2007-10-24 18:23:50 +020077#include <asm/irq_regs.h>
Gerald Schaefer335d7af2010-11-22 15:47:36 +010078#include <asm/mutex.h>
Glauber Costae6e66852011-07-11 15:28:17 -040079#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h>
81#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -070082
Gregory Haskins6e0534f2008-05-12 21:21:01 +020083#include "sched_cpupri.h"
Tejun Heo21aa9af2010-06-08 21:40:37 +020084#include "workqueue_sched.h"
Mike Galbraith5091faa2010-11-30 14:18:03 +010085#include "sched_autogroup.h"
Gregory Haskins6e0534f2008-05-12 21:21:01 +020086
Steven Rostedta8d154b2009-04-10 09:36:00 -040087#define CREATE_TRACE_POINTS
Steven Rostedtad8d75f2009-04-14 19:39:12 -040088#include <trace/events/sched.h>
Steven Rostedta8d154b2009-04-10 09:36:00 -040089
Linus Torvalds1da177e2005-04-16 15:20:36 -070090/*
91 * Convert user-nice values [ -20 ... 0 ... 19 ]
92 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
93 * and back.
94 */
95#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
96#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
97#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
98
99/*
100 * 'User priority' is the nice value converted to something we
101 * can work with better when scaling various scheduler parameters,
102 * it's a [ 0 ... 39 ] range.
103 */
104#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
105#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
106#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
107
108/*
Ingo Molnard7876a02008-01-25 21:08:19 +0100109 * Helpers for converting nanosecond timing to jiffy resolution
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110 */
Eric Dumazetd6322fa2007-11-09 22:39:38 +0100111#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700112
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200113#define NICE_0_LOAD SCHED_LOAD_SCALE
114#define NICE_0_SHIFT SCHED_LOAD_SHIFT
115
Linus Torvalds1da177e2005-04-16 15:20:36 -0700116/*
117 * These are the 'tuning knobs' of the scheduler:
118 *
Dmitry Adamushkoa4ec24b2007-10-15 17:00:13 +0200119 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
Linus Torvalds1da177e2005-04-16 15:20:36 -0700120 * Timeslices get refilled after they expire.
121 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700122#define DEF_TIMESLICE (100 * HZ / 1000)
Peter Williams2dd73a42006-06-27 02:54:34 -0700123
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200124/*
125 * single value that denotes runtime == period, ie unlimited time.
126 */
127#define RUNTIME_INF ((u64)~0ULL)
128
Ingo Molnare05606d2007-07-09 18:51:59 +0200129static inline int rt_policy(int policy)
130{
Steven Rostedt63f01242010-12-06 14:48:10 -0500131 if (policy == SCHED_FIFO || policy == SCHED_RR)
Ingo Molnare05606d2007-07-09 18:51:59 +0200132 return 1;
133 return 0;
134}
135
136static inline int task_has_rt_policy(struct task_struct *p)
137{
138 return rt_policy(p->policy);
139}
140
Linus Torvalds1da177e2005-04-16 15:20:36 -0700141/*
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200142 * This is the priority-queue data structure of the RT scheduling class:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700143 */
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200144struct rt_prio_array {
145 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
146 struct list_head queue[MAX_RT_PRIO];
147};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200149struct rt_bandwidth {
Ingo Molnarea736ed2008-03-25 13:51:45 +0100150 /* nests inside the rq lock: */
Thomas Gleixner0986b112009-11-17 15:32:06 +0100151 raw_spinlock_t rt_runtime_lock;
Ingo Molnarea736ed2008-03-25 13:51:45 +0100152 ktime_t rt_period;
153 u64 rt_runtime;
154 struct hrtimer rt_period_timer;
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200155};
156
157static struct rt_bandwidth def_rt_bandwidth;
158
159static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
160
161static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
162{
163 struct rt_bandwidth *rt_b =
164 container_of(timer, struct rt_bandwidth, rt_period_timer);
165 ktime_t now;
166 int overrun;
167 int idle = 0;
168
169 for (;;) {
170 now = hrtimer_cb_get_time(timer);
171 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
172
173 if (!overrun)
174 break;
175
176 idle = do_sched_rt_period_timer(rt_b, overrun);
177 }
178
179 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
180}
181
182static
183void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
184{
185 rt_b->rt_period = ns_to_ktime(period);
186 rt_b->rt_runtime = runtime;
187
Thomas Gleixner0986b112009-11-17 15:32:06 +0100188 raw_spin_lock_init(&rt_b->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +0200189
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200190 hrtimer_init(&rt_b->rt_period_timer,
191 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
192 rt_b->rt_period_timer.function = sched_rt_period_timer;
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200193}
194
Krzysztof Heltc8bfff62008-09-05 23:46:19 +0200195static inline int rt_bandwidth_enabled(void)
196{
197 return sysctl_sched_rt_runtime >= 0;
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200198}
199
Paul Turner58088ad2011-07-21 09:43:31 -0700200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
201{
202 unsigned long delta;
203 ktime_t soft, hard, now;
204
205 for (;;) {
206 if (hrtimer_active(period_timer))
207 break;
208
209 now = hrtimer_cb_get_time(period_timer);
210 hrtimer_forward(period_timer, now, period);
211
212 soft = hrtimer_get_softexpires(period_timer);
213 hard = hrtimer_get_expires(period_timer);
214 delta = ktime_to_ns(ktime_sub(hard, soft));
215 __hrtimer_start_range_ns(period_timer, soft, delta,
216 HRTIMER_MODE_ABS_PINNED, 0);
217 }
218}
219
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200220static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
221{
Hiroshi Shimamotocac64d02009-02-25 09:59:26 -0800222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
Thomas Gleixner0986b112009-11-17 15:32:06 +0100228 raw_spin_lock(&rt_b->rt_runtime_lock);
Paul Turner58088ad2011-07-21 09:43:31 -0700229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
Thomas Gleixner0986b112009-11-17 15:32:06 +0100230 raw_spin_unlock(&rt_b->rt_runtime_lock);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200231}
232
233#ifdef CONFIG_RT_GROUP_SCHED
234static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235{
236 hrtimer_cancel(&rt_b->rt_period_timer);
237}
238#endif
239
Heiko Carstens712555e2008-04-28 11:33:07 +0200240/*
Peter Zijlstrac4a88492011-04-07 14:09:42 +0200241 * sched_domains_mutex serializes calls to init_sched_domains,
Heiko Carstens712555e2008-04-28 11:33:07 +0200242 * detach_destroy_domains and partition_sched_domains.
243 */
244static DEFINE_MUTEX(sched_domains_mutex);
245
Dhaval Giani7c941432010-01-20 13:26:18 +0100246#ifdef CONFIG_CGROUP_SCHED
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +0200247
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -0700248#include <linux/cgroup.h>
249
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +0200250struct cfs_rq;
251
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100252static LIST_HEAD(task_groups);
253
Paul Turnerab84d312011-07-21 09:43:28 -0700254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
Paul Turnerec12cb72011-07-21 09:43:30 -0700258 u64 quota, runtime;
Paul Turnera790de92011-07-21 09:43:29 -0700259 s64 hierarchal_quota;
Paul Turnera9cf55b2011-07-21 09:43:32 -0700260 u64 runtime_expires;
Paul Turner58088ad2011-07-21 09:43:31 -0700261
262 int idle, timer_active;
Paul Turnerd8b49862011-07-21 09:43:41 -0700263 struct hrtimer period_timer, slack_timer;
Paul Turner85dac902011-07-21 09:43:33 -0700264 struct list_head throttled_cfs_rq;
265
Nikhil Raoe8da1b12011-07-21 09:43:40 -0700266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
Paul Turnerab84d312011-07-21 09:43:28 -0700269#endif
270};
271
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +0200272/* task group related information */
Ingo Molnar4cf86d72007-10-15 17:00:14 +0200273struct task_group {
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -0700274 struct cgroup_subsys_state css;
Arun R Bharadwaj6c415b92008-12-01 20:49:05 +0530275
Peter Zijlstra052f1dc2008-02-13 15:45:40 +0100276#ifdef CONFIG_FAIR_GROUP_SCHED
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +0200277 /* schedulable entities of this group on each cpu */
278 struct sched_entity **se;
279 /* runqueue "owned" by this group on each cpu */
280 struct cfs_rq **cfs_rq;
281 unsigned long shares;
Peter Zijlstra2069dd72010-11-15 15:47:00 -0800282
283 atomic_t load_weight;
Peter Zijlstra052f1dc2008-02-13 15:45:40 +0100284#endif
285
286#ifdef CONFIG_RT_GROUP_SCHED
287 struct sched_rt_entity **rt_se;
288 struct rt_rq **rt_rq;
289
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200290 struct rt_bandwidth rt_bandwidth;
Peter Zijlstra052f1dc2008-02-13 15:45:40 +0100291#endif
Srivatsa Vaddagiri6b2d7702008-01-25 21:08:00 +0100292
Srivatsa Vaddagiriae8393e2007-10-29 21:18:11 +0100293 struct rcu_head rcu;
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100294 struct list_head list;
Peter Zijlstraf473aa52008-04-19 19:45:00 +0200295
296 struct task_group *parent;
297 struct list_head siblings;
298 struct list_head children;
Mike Galbraith5091faa2010-11-30 14:18:03 +0100299
300#ifdef CONFIG_SCHED_AUTOGROUP
301 struct autogroup *autogroup;
302#endif
Paul Turnerab84d312011-07-21 09:43:28 -0700303
304 struct cfs_bandwidth cfs_bandwidth;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +0200305};
306
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800307/* task_group_lock serializes the addition/removal of task groups */
Peter Zijlstra8ed36992008-02-13 15:45:39 +0100308static DEFINE_SPINLOCK(task_group_lock);
Srivatsa Vaddagiriec2c5072008-01-25 21:07:59 +0100309
Cyrill Gorcunove9036b32009-10-26 22:24:14 +0300310#ifdef CONFIG_FAIR_GROUP_SCHED
311
Yong Zhang07e06b02011-01-07 15:17:36 +0800312# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
Srivatsa Vaddagiri24e377a2007-10-15 17:00:09 +0200313
Miao Xiecb4ad1f2008-04-28 12:54:56 +0800314/*
Lai Jiangshan2e084782008-06-12 16:42:58 +0800315 * A weight of 0 or 1 can cause arithmetics problems.
316 * A weight of a cfs_rq is the sum of weights of which entities
317 * are queued on this cfs_rq, so a weight of a entity should not be
318 * too large, so as the shares value of a task group.
Miao Xiecb4ad1f2008-04-28 12:54:56 +0800319 * (The default weight is 1024 - so there's no practical
320 * limitation from this.)
321 */
Mike Galbraithcd622872011-06-04 15:03:20 +0200322#define MIN_SHARES (1UL << 1)
323#define MAX_SHARES (1UL << 18)
Peter Zijlstra18d95a22008-04-19 19:45:00 +0200324
Yong Zhang07e06b02011-01-07 15:17:36 +0800325static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
Peter Zijlstra052f1dc2008-02-13 15:45:40 +0100326#endif
327
328/* Default task group.
329 * Every task in system belong to this group at bootup.
330 */
Yong Zhang07e06b02011-01-07 15:17:36 +0800331struct task_group root_task_group;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +0200332
Dhaval Giani7c941432010-01-20 13:26:18 +0100333#endif /* CONFIG_CGROUP_SCHED */
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +0200334
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200335/* CFS-related fields in a runqueue */
336struct cfs_rq {
337 struct load_weight load;
Paul Turner953bfcd2011-07-21 09:43:27 -0700338 unsigned long nr_running, h_nr_running;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200339
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200340 u64 exec_clock;
Ingo Molnare9acbff2007-10-15 17:00:04 +0200341 u64 min_vruntime;
Peter Zijlstra3fe16982011-04-05 17:23:48 +0200342#ifndef CONFIG_64BIT
343 u64 min_vruntime_copy;
344#endif
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200345
346 struct rb_root tasks_timeline;
347 struct rb_node *rb_leftmost;
Peter Zijlstra4a55bd52008-04-19 19:45:00 +0200348
349 struct list_head tasks;
350 struct list_head *balance_iterator;
351
352 /*
353 * 'curr' points to currently running entity on this cfs_rq.
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200354 * It is set to NULL otherwise (i.e when none are currently running).
355 */
Rik van Rielac53db52011-02-01 09:51:03 -0500356 struct sched_entity *curr, *next, *last, *skip;
Peter Zijlstraddc97292007-10-15 17:00:10 +0200357
Rakib Mullick4934a4d2011-05-04 22:53:46 +0600358#ifdef CONFIG_SCHED_DEBUG
Peter Zijlstra5ac5c4d2008-11-10 10:46:32 +0100359 unsigned int nr_spread_over;
Rakib Mullick4934a4d2011-05-04 22:53:46 +0600360#endif
Peter Zijlstraddc97292007-10-15 17:00:10 +0200361
Ingo Molnar62160e32007-10-15 17:00:03 +0200362#ifdef CONFIG_FAIR_GROUP_SCHED
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200363 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
364
Ingo Molnar41a2d6c2007-12-05 15:46:09 +0100365 /*
366 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200367 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
368 * (like users, containers etc.)
369 *
370 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
371 * list is used during load balance.
372 */
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -0800373 int on_list;
Ingo Molnar41a2d6c2007-12-05 15:46:09 +0100374 struct list_head leaf_cfs_rq_list;
375 struct task_group *tg; /* group that "owns" this runqueue */
Peter Zijlstrac09595f2008-06-27 13:41:14 +0200376
377#ifdef CONFIG_SMP
Peter Zijlstrac09595f2008-06-27 13:41:14 +0200378 /*
Peter Zijlstrac8cba852008-06-27 13:41:23 +0200379 * the part of load.weight contributed by tasks
Peter Zijlstrac09595f2008-06-27 13:41:14 +0200380 */
Peter Zijlstrac8cba852008-06-27 13:41:23 +0200381 unsigned long task_weight;
Peter Zijlstrac09595f2008-06-27 13:41:14 +0200382
Peter Zijlstrac8cba852008-06-27 13:41:23 +0200383 /*
384 * h_load = weight * f(tg)
385 *
386 * Where f(tg) is the recursive weight fraction assigned to
387 * this group.
388 */
389 unsigned long h_load;
Peter Zijlstrac09595f2008-06-27 13:41:14 +0200390
Peter Zijlstrac8cba852008-06-27 13:41:23 +0200391 /*
Paul Turner3b3d1902010-11-15 15:47:08 -0800392 * Maintaining per-cpu shares distribution for group scheduling
393 *
394 * load_stamp is the last time we updated the load average
395 * load_last is the last time we updated the load average and saw load
396 * load_unacc_exec_time is currently unaccounted execution time
Peter Zijlstrac8cba852008-06-27 13:41:23 +0200397 */
Peter Zijlstra2069dd72010-11-15 15:47:00 -0800398 u64 load_avg;
399 u64 load_period;
Paul Turner3b3d1902010-11-15 15:47:08 -0800400 u64 load_stamp, load_last, load_unacc_exec_time;
Peter Zijlstraf1d239f2008-06-27 13:41:38 +0200401
Peter Zijlstra2069dd72010-11-15 15:47:00 -0800402 unsigned long load_contribution;
Peter Zijlstrac09595f2008-06-27 13:41:14 +0200403#endif
Paul Turnerab84d312011-07-21 09:43:28 -0700404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
Paul Turnera9cf55b2011-07-21 09:43:32 -0700406 u64 runtime_expires;
Paul Turnerab84d312011-07-21 09:43:28 -0700407 s64 runtime_remaining;
Paul Turner85dac902011-07-21 09:43:33 -0700408
Nikhil Raoe8da1b12011-07-21 09:43:40 -0700409 u64 throttled_timestamp;
Paul Turner64660c82011-07-21 09:43:36 -0700410 int throttled, throttle_count;
Paul Turner85dac902011-07-21 09:43:33 -0700411 struct list_head throttled_list;
Paul Turnerab84d312011-07-21 09:43:28 -0700412#endif
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200413#endif
414};
415
Paul Turnerab84d312011-07-21 09:43:28 -0700416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
Paul Turner58088ad2011-07-21 09:43:31 -0700424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
Paul Turnerd8b49862011-07-21 09:43:41 -0700425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
Paul Turner58088ad2011-07-21 09:43:31 -0700435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
Paul Turnerab84d312011-07-21 09:43:28 -0700456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
Paul Turnerec12cb72011-07-21 09:43:30 -0700460 cfs_b->runtime = 0;
Paul Turnerab84d312011-07-21 09:43:28 -0700461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
Paul Turner58088ad2011-07-21 09:43:31 -0700463
Paul Turner85dac902011-07-21 09:43:33 -0700464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
Paul Turner58088ad2011-07-21 09:43:31 -0700465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
Paul Turnerd8b49862011-07-21 09:43:41 -0700467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
Paul Turnerab84d312011-07-21 09:43:28 -0700469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
Paul Turner85dac902011-07-21 09:43:33 -0700474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
Paul Turnerab84d312011-07-21 09:43:28 -0700475}
476
Paul Turner58088ad2011-07-21 09:43:31 -0700477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
Paul Turnerab84d312011-07-21 09:43:28 -0700501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
Paul Turner58088ad2011-07-21 09:43:31 -0700502{
503 hrtimer_cancel(&cfs_b->period_timer);
Paul Turnerd8b49862011-07-21 09:43:41 -0700504 hrtimer_cancel(&cfs_b->slack_timer);
Paul Turner58088ad2011-07-21 09:43:31 -0700505}
Paul Turnerab84d312011-07-21 09:43:28 -0700506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200518/* Real-Time classes' related field in a runqueue: */
519struct rt_rq {
520 struct rt_prio_array active;
Steven Rostedt63489e42008-01-25 21:08:03 +0100521 unsigned long rt_nr_running;
Peter Zijlstra052f1dc2008-02-13 15:45:40 +0100522#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
Gregory Haskinse864c492008-12-29 09:39:49 -0500523 struct {
524 int curr; /* highest queued rt task prio */
Gregory Haskins398a1532009-01-14 09:10:04 -0500525#ifdef CONFIG_SMP
Gregory Haskinse864c492008-12-29 09:39:49 -0500526 int next; /* next highest */
Gregory Haskins398a1532009-01-14 09:10:04 -0500527#endif
Gregory Haskinse864c492008-12-29 09:39:49 -0500528 } highest_prio;
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100529#endif
Peter Zijlstrafa85ae22008-01-25 21:08:29 +0100530#ifdef CONFIG_SMP
Gregory Haskins73fe6aa2008-01-25 21:08:07 +0100531 unsigned long rt_nr_migratory;
Peter Zijlstraa1ba4d82009-04-01 18:40:15 +0200532 unsigned long rt_nr_total;
Gregory Haskinsa22d7fc2008-01-25 21:08:12 +0100533 int overloaded;
Gregory Haskins917b6272008-12-29 09:39:53 -0500534 struct plist_head pushable_tasks;
Peter Zijlstrafa85ae22008-01-25 21:08:29 +0100535#endif
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100536 int rt_throttled;
Peter Zijlstrafa85ae22008-01-25 21:08:29 +0100537 u64 rt_time;
Peter Zijlstraac086bc2008-04-19 19:44:58 +0200538 u64 rt_runtime;
Ingo Molnarea736ed2008-03-25 13:51:45 +0100539 /* Nests inside the rq lock: */
Thomas Gleixner0986b112009-11-17 15:32:06 +0100540 raw_spinlock_t rt_runtime_lock;
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100541
Peter Zijlstra052f1dc2008-02-13 15:45:40 +0100542#ifdef CONFIG_RT_GROUP_SCHED
Peter Zijlstra23b0fdf2008-02-13 15:45:39 +0100543 unsigned long rt_nr_boosted;
544
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100545 struct rq *rq;
546 struct list_head leaf_rt_rq_list;
547 struct task_group *tg;
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100548#endif
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200549};
550
Gregory Haskins57d885f2008-01-25 21:08:18 +0100551#ifdef CONFIG_SMP
552
553/*
554 * We add the notion of a root-domain which will be used to define per-domain
Ingo Molnar0eab9142008-01-25 21:08:19 +0100555 * variables. Each exclusive cpuset essentially defines an island domain by
556 * fully partitioning the member cpus from any other cpuset. Whenever a new
Gregory Haskins57d885f2008-01-25 21:08:18 +0100557 * exclusive cpuset is created, we also create and attach a new root-domain
558 * object.
559 *
Gregory Haskins57d885f2008-01-25 21:08:18 +0100560 */
561struct root_domain {
562 atomic_t refcount;
Richard Kennedy26a148e2011-07-15 11:41:31 +0100563 atomic_t rto_count;
Peter Zijlstradce840a2011-04-07 14:09:50 +0200564 struct rcu_head rcu;
Rusty Russellc6c49272008-11-25 02:35:05 +1030565 cpumask_var_t span;
566 cpumask_var_t online;
Gregory Haskins637f5082008-01-25 21:08:18 +0100567
Ingo Molnar0eab9142008-01-25 21:08:19 +0100568 /*
Gregory Haskins637f5082008-01-25 21:08:18 +0100569 * The "RT overload" flag: it gets set if a CPU has more than
570 * one runnable RT task.
571 */
Rusty Russellc6c49272008-11-25 02:35:05 +1030572 cpumask_var_t rto_mask;
Gregory Haskins6e0534f2008-05-12 21:21:01 +0200573 struct cpupri cpupri;
Gregory Haskins57d885f2008-01-25 21:08:18 +0100574};
575
Gregory Haskinsdc938522008-01-25 21:08:26 +0100576/*
577 * By default the system creates a single root-domain with all cpus as
578 * members (mimicking the global state we have today).
579 */
Gregory Haskins57d885f2008-01-25 21:08:18 +0100580static struct root_domain def_root_domain;
581
Christian Dietriched2d3722010-09-06 16:37:05 +0200582#endif /* CONFIG_SMP */
Gregory Haskins57d885f2008-01-25 21:08:18 +0100583
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200584/*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700585 * This is the main, per-CPU runqueue data structure.
586 *
587 * Locking rule: those places that want to lock multiple runqueues
588 * (such as the load balancing or the thread migration code), lock
589 * acquire operations must be ordered by ascending &runqueue.
590 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700591struct rq {
Ingo Molnard8016492007-10-18 21:32:55 +0200592 /* runqueue lock: */
Thomas Gleixner05fa7852009-11-17 14:28:38 +0100593 raw_spinlock_t lock;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700594
595 /*
596 * nr_running and cpu_load should be in the same cacheline because
597 * remote CPUs use both these fields when doing load calculation.
598 */
599 unsigned long nr_running;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200600 #define CPU_LOAD_IDX_MAX 5
601 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -0700602 unsigned long last_load_update_tick;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -0700603#ifdef CONFIG_NO_HZ
Mike Galbraith39c0cbe2010-03-11 17:17:13 +0100604 u64 nohz_stamp;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -0700605 unsigned char nohz_balance_kick;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -0700606#endif
Mike Galbraith61eadef2011-04-29 08:36:50 +0200607 int skip_clock_update;
Mike Galbraitha64692a2010-03-11 17:16:20 +0100608
Ingo Molnard8016492007-10-18 21:32:55 +0200609 /* capture load from *all* tasks on this cpu: */
610 struct load_weight load;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200611 unsigned long nr_load_updates;
612 u64 nr_switches;
613
614 struct cfs_rq cfs;
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100615 struct rt_rq rt;
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100616
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200617#ifdef CONFIG_FAIR_GROUP_SCHED
Ingo Molnard8016492007-10-18 21:32:55 +0200618 /* list of leaf cfs_rq on this cpu: */
619 struct list_head leaf_cfs_rq_list;
Peter Zijlstra052f1dc2008-02-13 15:45:40 +0100620#endif
621#ifdef CONFIG_RT_GROUP_SCHED
Peter Zijlstra6f505b12008-01-25 21:08:30 +0100622 struct list_head leaf_rt_rq_list;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700623#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700624
625 /*
626 * This is part of a global counter where only the total sum
627 * over all CPUs matters. A task can increase this counter on
628 * one CPU and if it got migrated afterwards it may decrease
629 * it on another CPU. Always updated under the runqueue lock:
630 */
631 unsigned long nr_uninterruptible;
632
Peter Zijlstra34f971f2010-09-22 13:53:15 +0200633 struct task_struct *curr, *idle, *stop;
Christoph Lameterc9819f42006-12-10 02:20:25 -0800634 unsigned long next_balance;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700635 struct mm_struct *prev_mm;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200636
Peter Zijlstra3e51f332008-05-03 18:29:28 +0200637 u64 clock;
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -0700638 u64 clock_task;
Ingo Molnar6aa645e2007-07-09 18:51:58 +0200639
Linus Torvalds1da177e2005-04-16 15:20:36 -0700640 atomic_t nr_iowait;
641
642#ifdef CONFIG_SMP
Ingo Molnar0eab9142008-01-25 21:08:19 +0100643 struct root_domain *rd;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700644 struct sched_domain *sd;
645
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +0200646 unsigned long cpu_power;
647
Suresh Siddha6eb57e02011-10-03 15:09:01 -0700648 unsigned char idle_balance;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700649 /* For active balancing */
Gregory Haskins3f029d32009-07-29 11:08:47 -0400650 int post_schedule;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700651 int active_balance;
652 int push_cpu;
Tejun Heo969c7922010-05-06 18:49:21 +0200653 struct cpu_stop_work active_balance_work;
Ingo Molnard8016492007-10-18 21:32:55 +0200654 /* cpu of this runqueue: */
655 int cpu;
Gregory Haskins1f11eb62008-06-04 15:04:05 -0400656 int online;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700657
Peter Zijlstrae9e92502009-09-01 10:34:37 +0200658 u64 rt_avg;
659 u64 age_stamp;
Mike Galbraith1b9508f2009-11-04 17:53:50 +0100660 u64 idle_stamp;
661 u64 avg_idle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700662#endif
663
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -0700664#ifdef CONFIG_IRQ_TIME_ACCOUNTING
665 u64 prev_irq_time;
666#endif
Glauber Costae6e66852011-07-11 15:28:17 -0400667#ifdef CONFIG_PARAVIRT
668 u64 prev_steal_time;
669#endif
Glauber Costa095c0aa2011-07-11 15:28:18 -0400670#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
671 u64 prev_steal_time_rq;
672#endif
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -0700673
Thomas Gleixnerdce48a82009-04-11 10:43:41 +0200674 /* calc_load related fields */
675 unsigned long calc_load_update;
676 long calc_load_active;
677
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +0100678#ifdef CONFIG_SCHED_HRTICK
Peter Zijlstra31656512008-07-18 18:01:23 +0200679#ifdef CONFIG_SMP
680 int hrtick_csd_pending;
681 struct call_single_data hrtick_csd;
682#endif
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +0100683 struct hrtimer hrtick_timer;
684#endif
685
Linus Torvalds1da177e2005-04-16 15:20:36 -0700686#ifdef CONFIG_SCHEDSTATS
687 /* latency stats */
688 struct sched_info rq_sched_info;
Ken Chen9c2c4802008-12-16 23:41:22 -0800689 unsigned long long rq_cpu_time;
690 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700691
692 /* sys_sched_yield() stats */
Ken Chen480b9432007-10-18 21:32:56 +0200693 unsigned int yld_count;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700694
695 /* schedule() stats */
Ken Chen480b9432007-10-18 21:32:56 +0200696 unsigned int sched_switch;
697 unsigned int sched_count;
698 unsigned int sched_goidle;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700699
700 /* try_to_wake_up() stats */
Ken Chen480b9432007-10-18 21:32:56 +0200701 unsigned int ttwu_count;
702 unsigned int ttwu_local;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700703#endif
Peter Zijlstra317f3942011-04-05 17:23:58 +0200704
705#ifdef CONFIG_SMP
Peter Zijlstrafa14ff42011-09-12 13:06:17 +0200706 struct llist_head wake_list;
Peter Zijlstra317f3942011-04-05 17:23:58 +0200707#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700708};
709
Fenghua Yuf34e3b62007-07-19 01:48:13 -0700710static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700711
Mike Galbraitha64692a2010-03-11 17:16:20 +0100712
Peter Zijlstra1e5a7402010-10-31 12:37:04 +0100713static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
Ingo Molnardd41f592007-07-09 18:51:59 +0200714
Christoph Lameter0a2966b2006-09-25 23:30:51 -0700715static inline int cpu_of(struct rq *rq)
716{
717#ifdef CONFIG_SMP
718 return rq->cpu;
719#else
720 return 0;
721#endif
722}
723
Paul E. McKenney497f0ab2010-02-22 17:04:51 -0800724#define rcu_dereference_check_sched_domain(p) \
Paul E. McKenneyd11c5632010-02-22 17:04:50 -0800725 rcu_dereference_check((p), \
Paul E. McKenneyd11c5632010-02-22 17:04:50 -0800726 lockdep_is_held(&sched_domains_mutex))
727
Ingo Molnar20d315d2007-07-09 18:51:58 +0200728/*
Nick Piggin674311d2005-06-25 14:57:27 -0700729 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -0700730 * See detach_destroy_domains: synchronize_sched for details.
Nick Piggin674311d2005-06-25 14:57:27 -0700731 *
732 * The domain tree of any CPU may only be accessed from within
733 * preempt-disabled sections.
734 */
Ingo Molnar48f24c42006-07-03 00:25:40 -0700735#define for_each_domain(cpu, __sd) \
Paul E. McKenney497f0ab2010-02-22 17:04:51 -0800736 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700737
738#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
739#define this_rq() (&__get_cpu_var(runqueues))
740#define task_rq(p) cpu_rq(task_cpu(p))
741#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
Hitoshi Mitake54d35f22009-06-29 14:44:57 +0900742#define raw_rq() (&__raw_get_cpu_var(runqueues))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700743
Peter Zijlstradc61b1d2010-06-08 11:40:42 +0200744#ifdef CONFIG_CGROUP_SCHED
745
746/*
747 * Return the group to which this tasks belongs.
748 *
Peter Zijlstra6c6c54e2011-06-03 17:37:07 +0200749 * We use task_subsys_state_check() and extend the RCU verification with
750 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
751 * task it moves into the cgroup. Therefore by holding either of those locks,
752 * we pin the task to the current cgroup.
Peter Zijlstradc61b1d2010-06-08 11:40:42 +0200753 */
754static inline struct task_group *task_group(struct task_struct *p)
755{
Mike Galbraith5091faa2010-11-30 14:18:03 +0100756 struct task_group *tg;
Peter Zijlstradc61b1d2010-06-08 11:40:42 +0200757 struct cgroup_subsys_state *css;
758
759 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
Peter Zijlstra6c6c54e2011-06-03 17:37:07 +0200760 lockdep_is_held(&p->pi_lock) ||
761 lockdep_is_held(&task_rq(p)->lock));
Mike Galbraith5091faa2010-11-30 14:18:03 +0100762 tg = container_of(css, struct task_group, css);
763
764 return autogroup_task_group(p, tg);
Peter Zijlstradc61b1d2010-06-08 11:40:42 +0200765}
766
767/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
768static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
769{
770#ifdef CONFIG_FAIR_GROUP_SCHED
771 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
772 p->se.parent = task_group(p)->se[cpu];
773#endif
774
775#ifdef CONFIG_RT_GROUP_SCHED
776 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
777 p->rt.parent = task_group(p)->rt_se[cpu];
778#endif
779}
780
781#else /* CONFIG_CGROUP_SCHED */
782
783static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
784static inline struct task_group *task_group(struct task_struct *p)
785{
786 return NULL;
787}
788
789#endif /* CONFIG_CGROUP_SCHED */
790
Peter Zijlstrafe44d622010-12-09 14:15:34 +0100791static void update_rq_clock_task(struct rq *rq, s64 delta);
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -0700792
Peter Zijlstrafe44d622010-12-09 14:15:34 +0100793static void update_rq_clock(struct rq *rq)
Peter Zijlstra3e51f332008-05-03 18:29:28 +0200794{
Peter Zijlstrafe44d622010-12-09 14:15:34 +0100795 s64 delta;
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -0700796
Mike Galbraith61eadef2011-04-29 08:36:50 +0200797 if (rq->skip_clock_update > 0)
Mike Galbraithf26f9af2010-12-08 11:05:42 +0100798 return;
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -0700799
Peter Zijlstrafe44d622010-12-09 14:15:34 +0100800 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
801 rq->clock += delta;
802 update_rq_clock_task(rq, delta);
Peter Zijlstra3e51f332008-05-03 18:29:28 +0200803}
804
Ingo Molnare436d802007-07-19 21:28:35 +0200805/*
Ingo Molnarbf5c91b2007-10-15 17:00:04 +0200806 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
807 */
808#ifdef CONFIG_SCHED_DEBUG
809# define const_debug __read_mostly
810#else
811# define const_debug static const
812#endif
813
Ingo Molnar017730c2008-05-12 21:20:52 +0200814/**
Randy Dunlap1fd06bb2011-03-15 16:12:30 -0700815 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
Randy Dunlape17b38b2009-10-11 19:12:00 -0700816 * @cpu: the processor in question.
Ingo Molnar017730c2008-05-12 21:20:52 +0200817 *
Ingo Molnar017730c2008-05-12 21:20:52 +0200818 * This interface allows printk to be called with the runqueue lock
819 * held and know whether or not it is OK to wake up the klogd.
820 */
Andrew Morton89f19f02009-09-19 11:55:44 -0700821int runqueue_is_locked(int cpu)
Ingo Molnar017730c2008-05-12 21:20:52 +0200822{
Thomas Gleixner05fa7852009-11-17 14:28:38 +0100823 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
Ingo Molnar017730c2008-05-12 21:20:52 +0200824}
825
Ingo Molnarbf5c91b2007-10-15 17:00:04 +0200826/*
827 * Debugging: various feature bits
828 */
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200829
830#define SCHED_FEAT(name, enabled) \
831 __SCHED_FEAT_##name ,
832
Ingo Molnarbf5c91b2007-10-15 17:00:04 +0200833enum {
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200834#include "sched_features.h"
Ingo Molnarbf5c91b2007-10-15 17:00:04 +0200835};
836
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200837#undef SCHED_FEAT
Ingo Molnarbf5c91b2007-10-15 17:00:04 +0200838
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200839#define SCHED_FEAT(name, enabled) \
840 (1UL << __SCHED_FEAT_##name) * enabled |
841
842const_debug unsigned int sysctl_sched_features =
843#include "sched_features.h"
844 0;
845
846#undef SCHED_FEAT
847
848#ifdef CONFIG_SCHED_DEBUG
849#define SCHED_FEAT(name, enabled) \
850 #name ,
851
Harvey Harrison983ed7a2008-04-24 18:17:55 -0700852static __read_mostly char *sched_feat_names[] = {
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200853#include "sched_features.h"
854 NULL
855};
856
857#undef SCHED_FEAT
858
Li Zefan34f3a812008-10-30 15:23:32 +0800859static int sched_feat_show(struct seq_file *m, void *v)
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200860{
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200861 int i;
862
863 for (i = 0; sched_feat_names[i]; i++) {
Li Zefan34f3a812008-10-30 15:23:32 +0800864 if (!(sysctl_sched_features & (1UL << i)))
865 seq_puts(m, "NO_");
866 seq_printf(m, "%s ", sched_feat_names[i]);
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200867 }
Li Zefan34f3a812008-10-30 15:23:32 +0800868 seq_puts(m, "\n");
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200869
Li Zefan34f3a812008-10-30 15:23:32 +0800870 return 0;
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200871}
872
873static ssize_t
874sched_feat_write(struct file *filp, const char __user *ubuf,
875 size_t cnt, loff_t *ppos)
876{
877 char buf[64];
Mathieu Desnoyers77401912010-09-13 17:47:00 -0400878 char *cmp;
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200879 int neg = 0;
880 int i;
881
882 if (cnt > 63)
883 cnt = 63;
884
885 if (copy_from_user(&buf, ubuf, cnt))
886 return -EFAULT;
887
888 buf[cnt] = 0;
Mathieu Desnoyers77401912010-09-13 17:47:00 -0400889 cmp = strstrip(buf);
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200890
Hillf Danton524429c2011-01-06 20:58:12 +0800891 if (strncmp(cmp, "NO_", 3) == 0) {
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200892 neg = 1;
893 cmp += 3;
894 }
895
896 for (i = 0; sched_feat_names[i]; i++) {
Mathieu Desnoyers77401912010-09-13 17:47:00 -0400897 if (strcmp(cmp, sched_feat_names[i]) == 0) {
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200898 if (neg)
899 sysctl_sched_features &= ~(1UL << i);
900 else
901 sysctl_sched_features |= (1UL << i);
902 break;
903 }
904 }
905
906 if (!sched_feat_names[i])
907 return -EINVAL;
908
Jan Blunck42994722009-11-20 17:40:37 +0100909 *ppos += cnt;
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200910
911 return cnt;
912}
913
Li Zefan34f3a812008-10-30 15:23:32 +0800914static int sched_feat_open(struct inode *inode, struct file *filp)
915{
916 return single_open(filp, sched_feat_show, NULL);
917}
918
Alexey Dobriyan828c0952009-10-01 15:43:56 -0700919static const struct file_operations sched_feat_fops = {
Li Zefan34f3a812008-10-30 15:23:32 +0800920 .open = sched_feat_open,
921 .write = sched_feat_write,
922 .read = seq_read,
923 .llseek = seq_lseek,
924 .release = single_release,
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200925};
926
927static __init int sched_init_debug(void)
928{
Peter Zijlstraf00b45c2008-04-19 19:45:00 +0200929 debugfs_create_file("sched_features", 0644, NULL, NULL,
930 &sched_feat_fops);
931
932 return 0;
933}
934late_initcall(sched_init_debug);
935
936#endif
937
938#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
Ingo Molnarbf5c91b2007-10-15 17:00:04 +0200939
940/*
Peter Zijlstrab82d9fd2007-11-09 22:39:39 +0100941 * Number of tasks to iterate in a single balance run.
942 * Limited because this is done with IRQs disabled.
943 */
944const_debug unsigned int sysctl_sched_nr_migrate = 32;
945
946/*
Peter Zijlstrae9e92502009-09-01 10:34:37 +0200947 * period over which we average the RT time consumption, measured
948 * in ms.
949 *
950 * default: 1s
951 */
952const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
953
954/*
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +0100955 * period over which we measure -rt task cpu usage in us.
Peter Zijlstrafa85ae22008-01-25 21:08:29 +0100956 * default: 1s
957 */
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +0100958unsigned int sysctl_sched_rt_period = 1000000;
Peter Zijlstrafa85ae22008-01-25 21:08:29 +0100959
Ingo Molnar6892b752008-02-13 14:02:36 +0100960static __read_mostly int scheduler_running;
961
Peter Zijlstrafa85ae22008-01-25 21:08:29 +0100962/*
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +0100963 * part of the period that we allow rt tasks to run in us.
964 * default: 0.95s
Peter Zijlstrafa85ae22008-01-25 21:08:29 +0100965 */
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +0100966int sysctl_sched_rt_runtime = 950000;
967
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200968static inline u64 global_rt_period(void)
969{
970 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
971}
972
973static inline u64 global_rt_runtime(void)
974{
roel kluine26873b2008-07-22 16:51:15 -0400975 if (sysctl_sched_rt_runtime < 0)
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +0200976 return RUNTIME_INF;
977
978 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
979}
Peter Zijlstrafa85ae22008-01-25 21:08:29 +0100980
Linus Torvalds1da177e2005-04-16 15:20:36 -0700981#ifndef prepare_arch_switch
Nick Piggin4866cde2005-06-25 14:57:23 -0700982# define prepare_arch_switch(next) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700983#endif
Nick Piggin4866cde2005-06-25 14:57:23 -0700984#ifndef finish_arch_switch
985# define finish_arch_switch(prev) do { } while (0)
986#endif
987
Dmitry Adamushko051a1d12007-12-18 15:21:13 +0100988static inline int task_current(struct rq *rq, struct task_struct *p)
989{
990 return rq->curr == p;
991}
992
Ingo Molnar70b97a72006-07-03 00:25:42 -0700993static inline int task_running(struct rq *rq, struct task_struct *p)
Nick Piggin4866cde2005-06-25 14:57:23 -0700994{
Peter Zijlstra3ca7a442011-04-05 17:23:40 +0200995#ifdef CONFIG_SMP
996 return p->on_cpu;
997#else
Dmitry Adamushko051a1d12007-12-18 15:21:13 +0100998 return task_current(rq, p);
Peter Zijlstra3ca7a442011-04-05 17:23:40 +0200999#endif
Nick Piggin4866cde2005-06-25 14:57:23 -07001000}
1001
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02001002#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar70b97a72006-07-03 00:25:42 -07001003static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07001004{
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02001005#ifdef CONFIG_SMP
1006 /*
1007 * We can optimise this out completely for !SMP, because the
1008 * SMP rebalancing from interrupt is the only thing that cares
1009 * here.
1010 */
1011 next->on_cpu = 1;
1012#endif
Nick Piggin4866cde2005-06-25 14:57:23 -07001013}
1014
Ingo Molnar70b97a72006-07-03 00:25:42 -07001015static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
Nick Piggin4866cde2005-06-25 14:57:23 -07001016{
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02001017#ifdef CONFIG_SMP
1018 /*
1019 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1020 * We must ensure this doesn't happen until the switch is completely
1021 * finished.
1022 */
1023 smp_wmb();
1024 prev->on_cpu = 0;
1025#endif
Ingo Molnarda04c032005-09-13 11:17:59 +02001026#ifdef CONFIG_DEBUG_SPINLOCK
1027 /* this is a valid case when another task releases the spinlock */
1028 rq->lock.owner = current;
1029#endif
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07001030 /*
1031 * If we are tracking spinlock dependencies then we have to
1032 * fix up the runqueue lock - which gets 'carried over' from
1033 * prev into current:
1034 */
1035 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1036
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001037 raw_spin_unlock_irq(&rq->lock);
Nick Piggin4866cde2005-06-25 14:57:23 -07001038}
1039
1040#else /* __ARCH_WANT_UNLOCKED_CTXSW */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001041static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07001042{
1043#ifdef CONFIG_SMP
1044 /*
1045 * We can optimise this out completely for !SMP, because the
1046 * SMP rebalancing from interrupt is the only thing that cares
1047 * here.
1048 */
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02001049 next->on_cpu = 1;
Nick Piggin4866cde2005-06-25 14:57:23 -07001050#endif
1051#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001052 raw_spin_unlock_irq(&rq->lock);
Nick Piggin4866cde2005-06-25 14:57:23 -07001053#else
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001054 raw_spin_unlock(&rq->lock);
Nick Piggin4866cde2005-06-25 14:57:23 -07001055#endif
1056}
1057
Ingo Molnar70b97a72006-07-03 00:25:42 -07001058static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
Nick Piggin4866cde2005-06-25 14:57:23 -07001059{
1060#ifdef CONFIG_SMP
1061 /*
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02001062 * After ->on_cpu is cleared, the task can be moved to a different CPU.
Nick Piggin4866cde2005-06-25 14:57:23 -07001063 * We must ensure this doesn't happen until the switch is completely
1064 * finished.
1065 */
1066 smp_wmb();
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02001067 prev->on_cpu = 0;
Nick Piggin4866cde2005-06-25 14:57:23 -07001068#endif
1069#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1070 local_irq_enable();
1071#endif
1072}
1073#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001074
1075/*
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001076 * __task_rq_lock - lock the rq @p resides on.
Ingo Molnarb29739f2006-06-27 02:54:51 -07001077 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001078static inline struct rq *__task_rq_lock(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -07001079 __acquires(rq->lock)
1080{
Peter Zijlstra0970d292010-02-15 14:45:54 +01001081 struct rq *rq;
1082
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001083 lockdep_assert_held(&p->pi_lock);
1084
Andi Kleen3a5c3592007-10-15 17:00:14 +02001085 for (;;) {
Peter Zijlstra0970d292010-02-15 14:45:54 +01001086 rq = task_rq(p);
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001087 raw_spin_lock(&rq->lock);
Peter Zijlstra65cc8e42010-03-25 21:05:16 +01001088 if (likely(rq == task_rq(p)))
Andi Kleen3a5c3592007-10-15 17:00:14 +02001089 return rq;
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001090 raw_spin_unlock(&rq->lock);
Ingo Molnarb29739f2006-06-27 02:54:51 -07001091 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07001092}
1093
1094/*
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001095 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001096 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001097static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001098 __acquires(p->pi_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001099 __acquires(rq->lock)
1100{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001101 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001102
Andi Kleen3a5c3592007-10-15 17:00:14 +02001103 for (;;) {
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001104 raw_spin_lock_irqsave(&p->pi_lock, *flags);
Andi Kleen3a5c3592007-10-15 17:00:14 +02001105 rq = task_rq(p);
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001106 raw_spin_lock(&rq->lock);
Peter Zijlstra65cc8e42010-03-25 21:05:16 +01001107 if (likely(rq == task_rq(p)))
Andi Kleen3a5c3592007-10-15 17:00:14 +02001108 return rq;
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001109 raw_spin_unlock(&rq->lock);
1110 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001111 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001112}
1113
Alexey Dobriyana9957442007-10-15 17:00:13 +02001114static void __task_rq_unlock(struct rq *rq)
Ingo Molnarb29739f2006-06-27 02:54:51 -07001115 __releases(rq->lock)
1116{
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001117 raw_spin_unlock(&rq->lock);
Ingo Molnarb29739f2006-06-27 02:54:51 -07001118}
1119
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001120static inline void
1121task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001122 __releases(rq->lock)
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001123 __releases(p->pi_lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001124{
Peter Zijlstra0122ec52011-04-05 17:23:51 +02001125 raw_spin_unlock(&rq->lock);
1126 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001127}
1128
Linus Torvalds1da177e2005-04-16 15:20:36 -07001129/*
Robert P. J. Daycc2a73b2006-12-10 02:20:00 -08001130 * this_rq_lock - lock this runqueue and disable interrupts.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001131 */
Alexey Dobriyana9957442007-10-15 17:00:13 +02001132static struct rq *this_rq_lock(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001133 __acquires(rq->lock)
1134{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001135 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001136
1137 local_irq_disable();
1138 rq = this_rq();
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001139 raw_spin_lock(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001140
1141 return rq;
1142}
1143
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001144#ifdef CONFIG_SCHED_HRTICK
1145/*
1146 * Use HR-timers to deliver accurate preemption points.
1147 *
1148 * Its all a bit involved since we cannot program an hrt while holding the
1149 * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a
1150 * reschedule event.
1151 *
1152 * When we get rescheduled we reprogram the hrtick_timer outside of the
1153 * rq->lock.
1154 */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001155
1156/*
1157 * Use hrtick when:
1158 * - enabled by features
1159 * - hrtimer is actually high res
1160 */
1161static inline int hrtick_enabled(struct rq *rq)
1162{
1163 if (!sched_feat(HRTICK))
1164 return 0;
Ingo Molnarba420592008-07-20 11:02:06 +02001165 if (!cpu_active(cpu_of(rq)))
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001166 return 0;
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001167 return hrtimer_is_hres_active(&rq->hrtick_timer);
1168}
1169
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001170static void hrtick_clear(struct rq *rq)
1171{
1172 if (hrtimer_active(&rq->hrtick_timer))
1173 hrtimer_cancel(&rq->hrtick_timer);
1174}
1175
1176/*
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001177 * High-resolution timer tick.
1178 * Runs from hardirq context with interrupts disabled.
1179 */
1180static enum hrtimer_restart hrtick(struct hrtimer *timer)
1181{
1182 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
1183
1184 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1185
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001186 raw_spin_lock(&rq->lock);
Peter Zijlstra3e51f332008-05-03 18:29:28 +02001187 update_rq_clock(rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001188 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001189 raw_spin_unlock(&rq->lock);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001190
1191 return HRTIMER_NORESTART;
1192}
1193
Rabin Vincent95e904c2008-05-11 05:55:33 +05301194#ifdef CONFIG_SMP
Peter Zijlstra31656512008-07-18 18:01:23 +02001195/*
1196 * called from hardirq (IPI) context
1197 */
1198static void __hrtick_start(void *arg)
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001199{
Peter Zijlstra31656512008-07-18 18:01:23 +02001200 struct rq *rq = arg;
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001201
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001202 raw_spin_lock(&rq->lock);
Peter Zijlstra31656512008-07-18 18:01:23 +02001203 hrtimer_restart(&rq->hrtick_timer);
1204 rq->hrtick_csd_pending = 0;
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001205 raw_spin_unlock(&rq->lock);
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001206}
1207
Peter Zijlstra31656512008-07-18 18:01:23 +02001208/*
1209 * Called to set the hrtick timer state.
1210 *
1211 * called with rq->lock held and irqs disabled
1212 */
1213static void hrtick_start(struct rq *rq, u64 delay)
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001214{
Peter Zijlstra31656512008-07-18 18:01:23 +02001215 struct hrtimer *timer = &rq->hrtick_timer;
1216 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001217
Arjan van de Vencc584b22008-09-01 15:02:30 -07001218 hrtimer_set_expires(timer, time);
Peter Zijlstra31656512008-07-18 18:01:23 +02001219
1220 if (rq == this_rq()) {
1221 hrtimer_restart(timer);
1222 } else if (!rq->hrtick_csd_pending) {
Peter Zijlstra6e275632009-02-25 13:59:48 +01001223 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
Peter Zijlstra31656512008-07-18 18:01:23 +02001224 rq->hrtick_csd_pending = 1;
1225 }
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001226}
1227
1228static int
1229hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1230{
1231 int cpu = (int)(long)hcpu;
1232
1233 switch (action) {
1234 case CPU_UP_CANCELED:
1235 case CPU_UP_CANCELED_FROZEN:
1236 case CPU_DOWN_PREPARE:
1237 case CPU_DOWN_PREPARE_FROZEN:
1238 case CPU_DEAD:
1239 case CPU_DEAD_FROZEN:
Peter Zijlstra31656512008-07-18 18:01:23 +02001240 hrtick_clear(cpu_rq(cpu));
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001241 return NOTIFY_OK;
1242 }
1243
1244 return NOTIFY_DONE;
1245}
1246
Rakib Mullickfa748202008-09-22 14:55:45 -07001247static __init void init_hrtick(void)
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001248{
1249 hotcpu_notifier(hotplug_hrtick, 0);
1250}
Peter Zijlstra31656512008-07-18 18:01:23 +02001251#else
1252/*
1253 * Called to set the hrtick timer state.
1254 *
1255 * called with rq->lock held and irqs disabled
1256 */
1257static void hrtick_start(struct rq *rq, u64 delay)
1258{
Peter Zijlstra7f1e2ca2009-03-13 12:21:27 +01001259 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
Arun R Bharadwaj5c333862009-04-16 12:14:37 +05301260 HRTIMER_MODE_REL_PINNED, 0);
Peter Zijlstra31656512008-07-18 18:01:23 +02001261}
1262
Andrew Morton006c75f2008-09-22 14:55:46 -07001263static inline void init_hrtick(void)
Peter Zijlstra31656512008-07-18 18:01:23 +02001264{
1265}
Rabin Vincent95e904c2008-05-11 05:55:33 +05301266#endif /* CONFIG_SMP */
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001267
1268static void init_rq_hrtick(struct rq *rq)
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001269{
Peter Zijlstra31656512008-07-18 18:01:23 +02001270#ifdef CONFIG_SMP
1271 rq->hrtick_csd_pending = 0;
1272
1273 rq->hrtick_csd.flags = 0;
1274 rq->hrtick_csd.func = __hrtick_start;
1275 rq->hrtick_csd.info = rq;
1276#endif
1277
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001278 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1279 rq->hrtick_timer.function = hrtick;
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001280}
Andrew Morton006c75f2008-09-22 14:55:46 -07001281#else /* CONFIG_SCHED_HRTICK */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001282static inline void hrtick_clear(struct rq *rq)
1283{
1284}
1285
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001286static inline void init_rq_hrtick(struct rq *rq)
1287{
1288}
1289
Peter Zijlstrab328ca12008-04-29 10:02:46 +02001290static inline void init_hrtick(void)
1291{
1292}
Andrew Morton006c75f2008-09-22 14:55:46 -07001293#endif /* CONFIG_SCHED_HRTICK */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01001294
Ingo Molnar1b9f19c2007-07-09 18:51:59 +02001295/*
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001296 * resched_task - mark a task 'to be rescheduled now'.
1297 *
1298 * On UP this means the setting of the need_resched flag, on SMP it
1299 * might also involve a cross-CPU call to trigger the scheduler on
1300 * the target CPU.
1301 */
1302#ifdef CONFIG_SMP
1303
1304#ifndef tsk_is_polling
1305#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1306#endif
1307
Peter Zijlstra31656512008-07-18 18:01:23 +02001308static void resched_task(struct task_struct *p)
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001309{
1310 int cpu;
1311
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001312 assert_raw_spin_locked(&task_rq(p)->lock);
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001313
Lai Jiangshan5ed0cec2009-03-06 19:40:20 +08001314 if (test_tsk_need_resched(p))
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001315 return;
1316
Lai Jiangshan5ed0cec2009-03-06 19:40:20 +08001317 set_tsk_need_resched(p);
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001318
1319 cpu = task_cpu(p);
1320 if (cpu == smp_processor_id())
1321 return;
1322
1323 /* NEED_RESCHED must be visible before we test polling */
1324 smp_mb();
1325 if (!tsk_is_polling(p))
1326 smp_send_reschedule(cpu);
1327}
1328
1329static void resched_cpu(int cpu)
1330{
1331 struct rq *rq = cpu_rq(cpu);
1332 unsigned long flags;
1333
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001334 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001335 return;
1336 resched_task(cpu_curr(cpu));
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001337 raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001338}
Thomas Gleixner06d83082008-03-22 09:20:24 +01001339
1340#ifdef CONFIG_NO_HZ
1341/*
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07001342 * In the semi idle case, use the nearest busy cpu for migrating timers
1343 * from an idle cpu. This is good for power-savings.
1344 *
1345 * We don't do similar optimization for completely idle system, as
1346 * selecting an idle cpu will add more delays to the timers than intended
1347 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
1348 */
1349int get_nohz_timer_target(void)
1350{
1351 int cpu = smp_processor_id();
1352 int i;
1353 struct sched_domain *sd;
1354
Peter Zijlstra057f3fa2011-04-18 11:24:34 +02001355 rcu_read_lock();
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07001356 for_each_domain(cpu, sd) {
Peter Zijlstra057f3fa2011-04-18 11:24:34 +02001357 for_each_cpu(i, sched_domain_span(sd)) {
1358 if (!idle_cpu(i)) {
1359 cpu = i;
1360 goto unlock;
1361 }
1362 }
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07001363 }
Peter Zijlstra057f3fa2011-04-18 11:24:34 +02001364unlock:
1365 rcu_read_unlock();
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07001366 return cpu;
1367}
1368/*
Thomas Gleixner06d83082008-03-22 09:20:24 +01001369 * When add_timer_on() enqueues a timer into the timer wheel of an
1370 * idle CPU then this timer might expire before the next timer event
1371 * which is scheduled to wake up that CPU. In case of a completely
1372 * idle system the next event might even be infinite time into the
1373 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
1374 * leaves the inner idle loop so the newly added timer is taken into
1375 * account when the CPU goes back to idle and evaluates the timer
1376 * wheel for the next timer event.
1377 */
1378void wake_up_idle_cpu(int cpu)
1379{
1380 struct rq *rq = cpu_rq(cpu);
1381
1382 if (cpu == smp_processor_id())
1383 return;
1384
1385 /*
1386 * This is safe, as this function is called with the timer
1387 * wheel base lock of (cpu) held. When the CPU is on the way
1388 * to idle and has not yet set rq->curr to idle then it will
1389 * be serialized on the timer wheel base lock and take the new
1390 * timer into account automatically.
1391 */
1392 if (rq->curr != rq->idle)
1393 return;
1394
1395 /*
1396 * We can set TIF_RESCHED on the idle task of the other CPU
1397 * lockless. The worst case is that the other CPU runs the
1398 * idle task through an additional NOOP schedule()
1399 */
Lai Jiangshan5ed0cec2009-03-06 19:40:20 +08001400 set_tsk_need_resched(rq->idle);
Thomas Gleixner06d83082008-03-22 09:20:24 +01001401
1402 /* NEED_RESCHED must be visible before we test polling */
1403 smp_mb();
1404 if (!tsk_is_polling(rq->idle))
1405 smp_send_reschedule(cpu);
1406}
Mike Galbraith39c0cbe2010-03-11 17:17:13 +01001407
Suresh Siddhaca380622011-10-03 15:09:00 -07001408static inline bool got_nohz_idle_kick(void)
1409{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
1411}
1412
1413#else /* CONFIG_NO_HZ */
1414
1415static inline bool got_nohz_idle_kick(void)
1416{
1417 return false;
1418}
1419
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02001420#endif /* CONFIG_NO_HZ */
Thomas Gleixner06d83082008-03-22 09:20:24 +01001421
Peter Zijlstrae9e92502009-09-01 10:34:37 +02001422static u64 sched_avg_period(void)
1423{
1424 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1425}
1426
1427static void sched_avg_update(struct rq *rq)
1428{
1429 s64 period = sched_avg_period();
1430
1431 while ((s64)(rq->clock - rq->age_stamp) > period) {
Will Deacon0d98bb22010-05-24 12:11:43 -07001432 /*
1433 * Inline assembly required to prevent the compiler
1434 * optimising this loop into a divmod call.
1435 * See __iter_div_u64_rem() for another example of this.
1436 */
1437 asm("" : "+rm" (rq->age_stamp));
Peter Zijlstrae9e92502009-09-01 10:34:37 +02001438 rq->age_stamp += period;
1439 rq->rt_avg /= 2;
1440 }
1441}
1442
1443static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1444{
1445 rq->rt_avg += rt_delta;
1446 sched_avg_update(rq);
1447}
1448
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02001449#else /* !CONFIG_SMP */
Peter Zijlstra31656512008-07-18 18:01:23 +02001450static void resched_task(struct task_struct *p)
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001451{
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001452 assert_raw_spin_locked(&task_rq(p)->lock);
Peter Zijlstra31656512008-07-18 18:01:23 +02001453 set_tsk_need_resched(p);
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001454}
Peter Zijlstrae9e92502009-09-01 10:34:37 +02001455
1456static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1457{
1458}
Suresh Siddhada2b71e2010-08-23 13:42:51 -07001459
1460static void sched_avg_update(struct rq *rq)
1461{
1462}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02001463#endif /* CONFIG_SMP */
Ingo Molnarc24d20d2007-07-09 18:51:59 +02001464
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001465#if BITS_PER_LONG == 32
1466# define WMULT_CONST (~0UL)
1467#else
1468# define WMULT_CONST (1UL << 32)
1469#endif
1470
1471#define WMULT_SHIFT 32
1472
Ingo Molnar194081e2007-08-09 11:16:51 +02001473/*
1474 * Shift right and round:
1475 */
Ingo Molnarcf2ab462007-09-05 14:32:49 +02001476#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
Ingo Molnar194081e2007-08-09 11:16:51 +02001477
Peter Zijlstraa7be37a2008-06-27 13:41:11 +02001478/*
1479 * delta *= weight / lw
1480 */
Ingo Molnarcb1c4fc2007-08-02 17:41:40 +02001481static unsigned long
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001482calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1483 struct load_weight *lw)
1484{
1485 u64 tmp;
1486
Nikhil Raoc8b28112011-05-18 14:37:48 -07001487 /*
1488 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1489 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1490 * 2^SCHED_LOAD_RESOLUTION.
1491 */
1492 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1493 tmp = (u64)delta_exec * scale_load_down(weight);
1494 else
1495 tmp = (u64)delta_exec;
Stephan Baerwolfdb670da2011-05-11 18:03:29 +02001496
Lai Jiangshan7a232e02008-06-12 16:43:07 +08001497 if (!lw->inv_weight) {
Nikhil Raoc8b28112011-05-18 14:37:48 -07001498 unsigned long w = scale_load_down(lw->weight);
1499
1500 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
Lai Jiangshan7a232e02008-06-12 16:43:07 +08001501 lw->inv_weight = 1;
Nikhil Raoc8b28112011-05-18 14:37:48 -07001502 else if (unlikely(!w))
1503 lw->inv_weight = WMULT_CONST;
Lai Jiangshan7a232e02008-06-12 16:43:07 +08001504 else
Nikhil Raoc8b28112011-05-18 14:37:48 -07001505 lw->inv_weight = WMULT_CONST / w;
Lai Jiangshan7a232e02008-06-12 16:43:07 +08001506 }
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001507
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001508 /*
1509 * Check whether we'd overflow the 64-bit multiplication:
1510 */
Ingo Molnar194081e2007-08-09 11:16:51 +02001511 if (unlikely(tmp > WMULT_CONST))
Ingo Molnarcf2ab462007-09-05 14:32:49 +02001512 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
Ingo Molnar194081e2007-08-09 11:16:51 +02001513 WMULT_SHIFT/2);
1514 else
Ingo Molnarcf2ab462007-09-05 14:32:49 +02001515 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001516
Ingo Molnarecf691d2007-08-02 17:41:40 +02001517 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001518}
1519
Ingo Molnar10919852007-10-15 17:00:04 +02001520static inline void update_load_add(struct load_weight *lw, unsigned long inc)
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001521{
1522 lw->weight += inc;
Ingo Molnare89996a2008-03-14 23:48:28 +01001523 lw->inv_weight = 0;
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001524}
1525
Ingo Molnar10919852007-10-15 17:00:04 +02001526static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001527{
1528 lw->weight -= dec;
Ingo Molnare89996a2008-03-14 23:48:28 +01001529 lw->inv_weight = 0;
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001530}
1531
Peter Zijlstra2069dd72010-11-15 15:47:00 -08001532static inline void update_load_set(struct load_weight *lw, unsigned long w)
1533{
1534 lw->weight = w;
1535 lw->inv_weight = 0;
1536}
1537
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538/*
Peter Williams2dd73a42006-06-27 02:54:34 -07001539 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1540 * of tasks with abnormal "nice" values across CPUs the contribution that
1541 * each task makes to its run queue's load is weighted according to its
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01001542 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
Peter Williams2dd73a42006-06-27 02:54:34 -07001543 * scaled version of the new time slice allocation that they receive on time
1544 * slice expiry etc.
1545 */
1546
Peter Zijlstracce7ade2009-01-15 14:53:37 +01001547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
Ingo Molnardd41f592007-07-09 18:51:59 +02001549
1550/*
1551 * Nice levels are multiplicative, with a gentle 10% change for every
1552 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1553 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1554 * that remained on nice 0.
1555 *
1556 * The "10% effect" is relative and cumulative: from _any_ nice level,
1557 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
Ingo Molnarf9153ee2007-07-16 09:46:30 +02001558 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1559 * If a task goes up by ~10% and another task goes down by ~10% then
1560 * the relative distance between them is ~25%.)
Ingo Molnardd41f592007-07-09 18:51:59 +02001561 */
1562static const int prio_to_weight[40] = {
Ingo Molnar254753d2007-08-09 11:16:51 +02001563 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1564 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1565 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1566 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1567 /* 0 */ 1024, 820, 655, 526, 423,
1568 /* 5 */ 335, 272, 215, 172, 137,
1569 /* 10 */ 110, 87, 70, 56, 45,
1570 /* 15 */ 36, 29, 23, 18, 15,
Ingo Molnardd41f592007-07-09 18:51:59 +02001571};
1572
Ingo Molnar5714d2d2007-07-16 09:46:31 +02001573/*
1574 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1575 *
1576 * In cases where the weight does not change often, we can use the
1577 * precalculated inverse to speed up arithmetics by turning divisions
1578 * into multiplications:
1579 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001580static const u32 prio_to_wmult[40] = {
Ingo Molnar254753d2007-08-09 11:16:51 +02001581 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1582 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1583 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1584 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1585 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1586 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1587 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1588 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
Ingo Molnardd41f592007-07-09 18:51:59 +02001589};
Peter Williams2dd73a42006-06-27 02:54:34 -07001590
Bharata B Raoef12fef2009-03-31 10:02:22 +05301591/* Time spent by the tasks of the cpu accounting group executing in ... */
1592enum cpuacct_stat_index {
1593 CPUACCT_STAT_USER, /* ... user mode */
1594 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1595
1596 CPUACCT_STAT_NSTATS,
1597};
1598
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01001599#ifdef CONFIG_CGROUP_CPUACCT
1600static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
Bharata B Raoef12fef2009-03-31 10:02:22 +05301601static void cpuacct_update_stats(struct task_struct *tsk,
1602 enum cpuacct_stat_index idx, cputime_t val);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01001603#else
1604static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
Bharata B Raoef12fef2009-03-31 10:02:22 +05301605static inline void cpuacct_update_stats(struct task_struct *tsk,
1606 enum cpuacct_stat_index idx, cputime_t val) {}
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01001607#endif
1608
Peter Zijlstra18d95a22008-04-19 19:45:00 +02001609static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1610{
1611 update_load_add(&rq->load, load);
1612}
1613
1614static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1615{
1616 update_load_sub(&rq->load, load);
1617}
1618
Paul Turnera790de92011-07-21 09:43:29 -07001619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
Peter Zijlstraeb755802008-08-19 12:33:05 +02001621typedef int (*tg_visitor)(struct task_group *, void *);
1622
1623/*
Paul Turner82774342011-07-21 09:43:35 -07001624 * Iterate task_group tree rooted at *from, calling @down when first entering a
1625 * node and @up when leaving it for the final time.
1626 *
1627 * Caller must hold rcu_lock or sufficient equivalent.
Peter Zijlstraeb755802008-08-19 12:33:05 +02001628 */
Paul Turner82774342011-07-21 09:43:35 -07001629static int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data)
Peter Zijlstraeb755802008-08-19 12:33:05 +02001631{
1632 struct task_group *parent, *child;
1633 int ret;
1634
Paul Turner82774342011-07-21 09:43:35 -07001635 parent = from;
1636
Peter Zijlstraeb755802008-08-19 12:33:05 +02001637down:
1638 ret = (*down)(parent, data);
1639 if (ret)
Paul Turner82774342011-07-21 09:43:35 -07001640 goto out;
Peter Zijlstraeb755802008-08-19 12:33:05 +02001641 list_for_each_entry_rcu(child, &parent->children, siblings) {
1642 parent = child;
1643 goto down;
1644
1645up:
1646 continue;
1647 }
1648 ret = (*up)(parent, data);
Paul Turner82774342011-07-21 09:43:35 -07001649 if (ret || parent == from)
1650 goto out;
Peter Zijlstraeb755802008-08-19 12:33:05 +02001651
1652 child = parent;
1653 parent = parent->parent;
1654 if (parent)
1655 goto up;
Paul Turner82774342011-07-21 09:43:35 -07001656out:
Peter Zijlstraeb755802008-08-19 12:33:05 +02001657 return ret;
1658}
1659
Paul Turner82774342011-07-21 09:43:35 -07001660/*
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
Peter Zijlstraeb755802008-08-19 12:33:05 +02001672static int tg_nop(struct task_group *tg, void *data)
1673{
1674 return 0;
1675}
1676#endif
1677
Gregory Haskinse7693a32008-01-25 21:08:09 +01001678#ifdef CONFIG_SMP
Peter Zijlstraf5f08f32009-09-10 13:35:28 +02001679/* Used instead of source_load when we know the type == 0 */
1680static unsigned long weighted_cpuload(const int cpu)
1681{
1682 return cpu_rq(cpu)->load.weight;
1683}
1684
1685/*
1686 * Return a low guess at the load of a migration-source cpu weighted
1687 * according to the scheduling class and "nice" value.
1688 *
1689 * We want to under-estimate the load of migration sources, to
1690 * balance conservatively.
1691 */
1692static unsigned long source_load(int cpu, int type)
1693{
1694 struct rq *rq = cpu_rq(cpu);
1695 unsigned long total = weighted_cpuload(cpu);
1696
1697 if (type == 0 || !sched_feat(LB_BIAS))
1698 return total;
1699
1700 return min(rq->cpu_load[type-1], total);
1701}
1702
1703/*
1704 * Return a high guess at the load of a migration-target cpu weighted
1705 * according to the scheduling class and "nice" value.
1706 */
1707static unsigned long target_load(int cpu, int type)
1708{
1709 struct rq *rq = cpu_rq(cpu);
1710 unsigned long total = weighted_cpuload(cpu);
1711
1712 if (type == 0 || !sched_feat(LB_BIAS))
1713 return total;
1714
1715 return max(rq->cpu_load[type-1], total);
1716}
1717
Peter Zijlstraae154be2009-09-10 14:40:57 +02001718static unsigned long power_of(int cpu)
1719{
Peter Zijlstrae51fd5e2010-05-31 12:37:30 +02001720 return cpu_rq(cpu)->cpu_power;
Peter Zijlstraae154be2009-09-10 14:40:57 +02001721}
1722
Gregory Haskinse7693a32008-01-25 21:08:09 +01001723static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
Peter Zijlstrac09595f2008-06-27 13:41:14 +02001724
Peter Zijlstraa8a51d52008-06-27 13:41:26 +02001725static unsigned long cpu_avg_load_per_task(int cpu)
1726{
1727 struct rq *rq = cpu_rq(cpu);
Ingo Molnaraf6d5962008-11-29 20:45:15 +01001728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
Peter Zijlstraa8a51d52008-06-27 13:41:26 +02001729
Steven Rostedt4cd42622008-11-26 21:04:24 -05001730 if (nr_running)
Jan H. Schönherre2b245f2011-08-01 11:03:28 +02001731 return rq->load.weight / nr_running;
Peter Zijlstraa8a51d52008-06-27 13:41:26 +02001732
Jan H. Schönherre2b245f2011-08-01 11:03:28 +02001733 return 0;
Peter Zijlstraa8a51d52008-06-27 13:41:26 +02001734}
1735
Gregory Haskins8f45e2b2008-12-29 09:39:51 -05001736#ifdef CONFIG_PREEMPT
1737
Peter Zijlstrab78bb862009-09-15 14:23:18 +02001738static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739
Alexey Dobriyan70574a92008-11-28 22:08:00 +03001740/*
Gregory Haskins8f45e2b2008-12-29 09:39:51 -05001741 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1742 * way at the expense of forcing extra atomic operations in all
1743 * invocations. This assures that the double_lock is acquired using the
1744 * same underlying policy as the spinlock_t on this architecture, which
1745 * reduces latency compared to the unfair variant below. However, it
1746 * also adds more overhead and therefore may reduce throughput.
Alexey Dobriyan70574a92008-11-28 22:08:00 +03001747 */
Gregory Haskins8f45e2b2008-12-29 09:39:51 -05001748static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock)
1752{
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001753 raw_spin_unlock(&this_rq->lock);
Gregory Haskins8f45e2b2008-12-29 09:39:51 -05001754 double_rq_lock(this_rq, busiest);
1755
1756 return 1;
1757}
1758
1759#else
1760/*
1761 * Unfair double_lock_balance: Optimizes throughput at the expense of
1762 * latency by eliminating extra atomic operations when the locks are
1763 * already in proper order on entry. This favors lower cpu-ids and will
1764 * grant the double lock to lower cpus over higher ids under contention,
1765 * regardless of entry order into the function.
1766 */
1767static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
Alexey Dobriyan70574a92008-11-28 22:08:00 +03001768 __releases(this_rq->lock)
1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock)
1771{
1772 int ret = 0;
1773
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001774 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
Alexey Dobriyan70574a92008-11-28 22:08:00 +03001775 if (busiest < this_rq) {
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001776 raw_spin_unlock(&this_rq->lock);
1777 raw_spin_lock(&busiest->lock);
1778 raw_spin_lock_nested(&this_rq->lock,
1779 SINGLE_DEPTH_NESTING);
Alexey Dobriyan70574a92008-11-28 22:08:00 +03001780 ret = 1;
1781 } else
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001782 raw_spin_lock_nested(&busiest->lock,
1783 SINGLE_DEPTH_NESTING);
Alexey Dobriyan70574a92008-11-28 22:08:00 +03001784 }
1785 return ret;
1786}
1787
Gregory Haskins8f45e2b2008-12-29 09:39:51 -05001788#endif /* CONFIG_PREEMPT */
1789
1790/*
1791 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1792 */
1793static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1794{
1795 if (unlikely(!irqs_disabled())) {
1796 /* printk() doesn't work good under rq->lock */
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001797 raw_spin_unlock(&this_rq->lock);
Gregory Haskins8f45e2b2008-12-29 09:39:51 -05001798 BUG_ON(1);
1799 }
1800
1801 return _double_lock_balance(this_rq, busiest);
1802}
1803
Alexey Dobriyan70574a92008-11-28 22:08:00 +03001804static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 __releases(busiest->lock)
1806{
Thomas Gleixner05fa7852009-11-17 14:28:38 +01001807 raw_spin_unlock(&busiest->lock);
Alexey Dobriyan70574a92008-11-28 22:08:00 +03001808 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1809}
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001810
1811/*
1812 * double_rq_lock - safely lock two runqueues
1813 *
1814 * Note this does not disable interrupts like task_rq_lock,
1815 * you need to do so manually before calling.
1816 */
1817static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1818 __acquires(rq1->lock)
1819 __acquires(rq2->lock)
1820{
1821 BUG_ON(!irqs_disabled());
1822 if (rq1 == rq2) {
1823 raw_spin_lock(&rq1->lock);
1824 __acquire(rq2->lock); /* Fake it out ;) */
1825 } else {
1826 if (rq1 < rq2) {
1827 raw_spin_lock(&rq1->lock);
1828 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1829 } else {
1830 raw_spin_lock(&rq2->lock);
1831 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1832 }
1833 }
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001834}
1835
1836/*
1837 * double_rq_unlock - safely unlock two runqueues
1838 *
1839 * Note this does not restore interrupts like task_rq_unlock,
1840 * you need to do so manually after calling.
1841 */
1842static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1843 __releases(rq1->lock)
1844 __releases(rq2->lock)
1845{
1846 raw_spin_unlock(&rq1->lock);
1847 if (rq1 != rq2)
1848 raw_spin_unlock(&rq2->lock);
1849 else
1850 __release(rq2->lock);
1851}
1852
Mike Galbraithd95f4122011-02-01 09:50:51 -05001853#else /* CONFIG_SMP */
1854
1855/*
1856 * double_rq_lock - safely lock two runqueues
1857 *
1858 * Note this does not disable interrupts like task_rq_lock,
1859 * you need to do so manually before calling.
1860 */
1861static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1862 __acquires(rq1->lock)
1863 __acquires(rq2->lock)
1864{
1865 BUG_ON(!irqs_disabled());
1866 BUG_ON(rq1 != rq2);
1867 raw_spin_lock(&rq1->lock);
1868 __acquire(rq2->lock); /* Fake it out ;) */
1869}
1870
1871/*
1872 * double_rq_unlock - safely unlock two runqueues
1873 *
1874 * Note this does not restore interrupts like task_rq_unlock,
1875 * you need to do so manually after calling.
1876 */
1877static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1878 __releases(rq1->lock)
1879 __releases(rq2->lock)
1880{
1881 BUG_ON(rq1 != rq2);
1882 raw_spin_unlock(&rq1->lock);
1883 __release(rq2->lock);
1884}
1885
Peter Zijlstra18d95a22008-04-19 19:45:00 +02001886#endif
Peter Zijlstra18d95a22008-04-19 19:45:00 +02001887
Peter Zijlstra74f51872010-04-22 21:50:19 +02001888static void calc_load_account_idle(struct rq *this_rq);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01001889static void update_sysctl(void);
Christian Ehrhardtacb4a842009-11-30 12:16:48 +01001890static int get_update_sysctl_factor(void);
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07001891static void update_cpu_load(struct rq *this_rq);
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02001892
Peter Zijlstracd29fe62009-11-27 17:32:46 +01001893static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1894{
1895 set_task_rq(p, cpu);
1896#ifdef CONFIG_SMP
1897 /*
1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
Joe Perchesbfb90352011-08-17 06:58:04 -07001899 * successfully executed on another CPU. We must ensure that updates of
Peter Zijlstracd29fe62009-11-27 17:32:46 +01001900 * per-task data have been completed by this moment.
1901 */
1902 smp_wmb();
1903 task_thread_info(p)->cpu = cpu;
1904#endif
1905}
Peter Zijlstra18d95a22008-04-19 19:45:00 +02001906
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001907static const struct sched_class rt_sched_class;
Ingo Molnardd41f592007-07-09 18:51:59 +02001908
Peter Zijlstra34f971f2010-09-22 13:53:15 +02001909#define sched_class_highest (&stop_sched_class)
Gregory Haskins1f11eb62008-06-04 15:04:05 -04001910#define for_each_class(class) \
1911 for (class = sched_class_highest; class; class = class->next)
Ingo Molnardd41f592007-07-09 18:51:59 +02001912
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001913#include "sched_stats.h"
1914
Peter Zijlstrac09595f2008-06-27 13:41:14 +02001915static void inc_nr_running(struct rq *rq)
Ingo Molnar6363ca52008-05-29 11:28:57 +02001916{
1917 rq->nr_running++;
Ingo Molnar6363ca52008-05-29 11:28:57 +02001918}
1919
Peter Zijlstrac09595f2008-06-27 13:41:14 +02001920static void dec_nr_running(struct rq *rq)
Ingo Molnar9c217242007-08-02 17:41:40 +02001921{
1922 rq->nr_running--;
Ingo Molnar9c217242007-08-02 17:41:40 +02001923}
1924
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001925static void set_load_weight(struct task_struct *p)
1926{
Nikhil Raof05998d2011-05-18 10:09:38 -07001927 int prio = p->static_prio - MAX_RT_PRIO;
1928 struct load_weight *load = &p->se.load;
1929
Ingo Molnardd41f592007-07-09 18:51:59 +02001930 /*
1931 * SCHED_IDLE tasks get minimal weight:
1932 */
1933 if (p->policy == SCHED_IDLE) {
Nikhil Raoc8b28112011-05-18 14:37:48 -07001934 load->weight = scale_load(WEIGHT_IDLEPRIO);
Nikhil Raof05998d2011-05-18 10:09:38 -07001935 load->inv_weight = WMULT_IDLEPRIO;
Ingo Molnardd41f592007-07-09 18:51:59 +02001936 return;
1937 }
1938
Nikhil Raoc8b28112011-05-18 14:37:48 -07001939 load->weight = scale_load(prio_to_weight[prio]);
Nikhil Raof05998d2011-05-18 10:09:38 -07001940 load->inv_weight = prio_to_wmult[prio];
Ingo Molnar45bf76d2007-07-09 18:51:59 +02001941}
1942
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001943static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
Gregory Haskins2087a1a2008-06-27 14:30:00 -06001944{
Mike Galbraitha64692a2010-03-11 17:16:20 +01001945 update_rq_clock(rq);
Ingo Molnar71f8bd42007-07-09 18:51:59 +02001946 sched_info_queued(p);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001947 p->sched_class->enqueue_task(rq, p, flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02001948}
1949
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001950static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
Ingo Molnardd41f592007-07-09 18:51:59 +02001951{
Mike Galbraitha64692a2010-03-11 17:16:20 +01001952 update_rq_clock(rq);
Ankita Garg46ac22b2008-07-01 14:30:06 +05301953 sched_info_dequeued(p);
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001954 p->sched_class->dequeue_task(rq, p, flags);
Ingo Molnar71f8bd42007-07-09 18:51:59 +02001955}
1956
1957/*
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001958 * activate_task - move a task to the runqueue.
1959 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001960static void activate_task(struct rq *rq, struct task_struct *p, int flags)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001961{
1962 if (task_contributes_to_load(p))
1963 rq->nr_uninterruptible--;
1964
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001965 enqueue_task(rq, p, flags);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001966}
1967
1968/*
1969 * deactivate_task - remove a task from the runqueue.
1970 */
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001971static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001972{
1973 if (task_contributes_to_load(p))
1974 rq->nr_uninterruptible++;
1975
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01001976 dequeue_task(rq, p, flags);
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01001977}
1978
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07001979#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1980
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -07001981/*
1982 * There are no locks covering percpu hardirq/softirq time.
1983 * They are only modified in account_system_vtime, on corresponding CPU
1984 * with interrupts disabled. So, writes are safe.
1985 * They are read and saved off onto struct rq in update_rq_clock().
1986 * This may result in other CPU reading this CPU's irq time and can
1987 * race with irq/account_system_vtime on this CPU. We would either get old
Peter Zijlstra8e92c202010-12-09 14:15:34 +01001988 * or new value with a side effect of accounting a slice of irq time to wrong
1989 * task when irq is in progress while we read rq->clock. That is a worthy
1990 * compromise in place of having locks on each irq in account_system_time.
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -07001991 */
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07001992static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1993static DEFINE_PER_CPU(u64, cpu_softirq_time);
1994
1995static DEFINE_PER_CPU(u64, irq_start_time);
1996static int sched_clock_irqtime;
1997
1998void enable_sched_clock_irqtime(void)
1999{
2000 sched_clock_irqtime = 1;
2001}
2002
2003void disable_sched_clock_irqtime(void)
2004{
2005 sched_clock_irqtime = 0;
2006}
2007
Peter Zijlstra8e92c202010-12-09 14:15:34 +01002008#ifndef CONFIG_64BIT
2009static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
2010
2011static inline void irq_time_write_begin(void)
2012{
2013 __this_cpu_inc(irq_time_seq.sequence);
2014 smp_wmb();
2015}
2016
2017static inline void irq_time_write_end(void)
2018{
2019 smp_wmb();
2020 __this_cpu_inc(irq_time_seq.sequence);
2021}
2022
2023static inline u64 irq_time_read(int cpu)
2024{
2025 u64 irq_time;
2026 unsigned seq;
2027
2028 do {
2029 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
2030 irq_time = per_cpu(cpu_softirq_time, cpu) +
2031 per_cpu(cpu_hardirq_time, cpu);
2032 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
2033
2034 return irq_time;
2035}
2036#else /* CONFIG_64BIT */
2037static inline void irq_time_write_begin(void)
2038{
2039}
2040
2041static inline void irq_time_write_end(void)
2042{
2043}
2044
2045static inline u64 irq_time_read(int cpu)
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -07002046{
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -07002047 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
2048}
Peter Zijlstra8e92c202010-12-09 14:15:34 +01002049#endif /* CONFIG_64BIT */
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -07002050
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002051/*
2052 * Called before incrementing preempt_count on {soft,}irq_enter
2053 * and before decrementing preempt_count on {soft,}irq_exit.
2054 */
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002055void account_system_vtime(struct task_struct *curr)
2056{
2057 unsigned long flags;
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002058 s64 delta;
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002059 int cpu;
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002060
2061 if (!sched_clock_irqtime)
2062 return;
2063
2064 local_irq_save(flags);
2065
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002066 cpu = smp_processor_id();
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002067 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
2068 __this_cpu_add(irq_start_time, delta);
2069
Peter Zijlstra8e92c202010-12-09 14:15:34 +01002070 irq_time_write_begin();
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002071 /*
2072 * We do not account for softirq time from ksoftirqd here.
2073 * We want to continue accounting softirq time to ksoftirqd thread
2074 * in that case, so as not to confuse scheduler with a special task
2075 * that do not consume any time, but still wants to run.
2076 */
2077 if (hardirq_count())
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002078 __this_cpu_add(cpu_hardirq_time, delta);
Venkatesh Pallipadi4dd53d82010-12-21 17:09:00 -08002079 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002080 __this_cpu_add(cpu_softirq_time, delta);
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002081
Peter Zijlstra8e92c202010-12-09 14:15:34 +01002082 irq_time_write_end();
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002083 local_irq_restore(flags);
2084}
Ingo Molnarb7dadc32010-10-18 20:00:37 +02002085EXPORT_SYMBOL_GPL(account_system_vtime);
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002086
Glauber Costae6e66852011-07-11 15:28:17 -04002087#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2088
2089#ifdef CONFIG_PARAVIRT
2090static inline u64 steal_ticks(u64 steal)
2091{
2092 if (unlikely(steal > NSEC_PER_SEC))
2093 return div_u64(steal, TICK_NSEC);
2094
2095 return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
2096}
2097#endif
2098
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002099static void update_rq_clock_task(struct rq *rq, s64 delta)
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07002100{
Glauber Costa095c0aa2011-07-11 15:28:18 -04002101/*
2102 * In theory, the compile should just see 0 here, and optimize out the call
2103 * to sched_rt_avg_update. But I don't trust it...
2104 */
2105#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
2106 s64 steal = 0, irq_delta = 0;
2107#endif
2108#ifdef CONFIG_IRQ_TIME_ACCOUNTING
Peter Zijlstra8e92c202010-12-09 14:15:34 +01002109 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002110
2111 /*
2112 * Since irq_time is only updated on {soft,}irq_exit, we might run into
2113 * this case when a previous update_rq_clock() happened inside a
2114 * {soft,}irq region.
2115 *
2116 * When this happens, we stop ->clock_task and only update the
2117 * prev_irq_time stamp to account for the part that fit, so that a next
2118 * update will consume the rest. This ensures ->clock_task is
2119 * monotonic.
2120 *
2121 * It does however cause some slight miss-attribution of {soft,}irq
2122 * time, a more accurate solution would be to update the irq_time using
2123 * the current rq->clock timestamp, except that would require using
2124 * atomic ops.
2125 */
2126 if (irq_delta > delta)
2127 irq_delta = delta;
2128
2129 rq->prev_irq_time += irq_delta;
2130 delta -= irq_delta;
Glauber Costa095c0aa2011-07-11 15:28:18 -04002131#endif
2132#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
2133 if (static_branch((&paravirt_steal_rq_enabled))) {
2134 u64 st;
2135
2136 steal = paravirt_steal_clock(cpu_of(rq));
2137 steal -= rq->prev_steal_time_rq;
2138
2139 if (unlikely(steal > delta))
2140 steal = delta;
2141
2142 st = steal_ticks(steal);
2143 steal = st * TICK_NSEC;
2144
2145 rq->prev_steal_time_rq += steal;
2146
2147 delta -= steal;
2148 }
2149#endif
2150
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002151 rq->clock_task += delta;
2152
Glauber Costa095c0aa2011-07-11 15:28:18 -04002153#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
2154 if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
2155 sched_rt_avg_update(rq, irq_delta + steal);
2156#endif
Venkatesh Pallipadiaa483802010-10-04 17:03:22 -07002157}
2158
Glauber Costa095c0aa2011-07-11 15:28:18 -04002159#ifdef CONFIG_IRQ_TIME_ACCOUNTING
Venkatesh Pallipadiabb74ce2010-12-21 17:09:03 -08002160static int irqtime_account_hi_update(void)
2161{
2162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2163 unsigned long flags;
2164 u64 latest_ns;
2165 int ret = 0;
2166
2167 local_irq_save(flags);
2168 latest_ns = this_cpu_read(cpu_hardirq_time);
2169 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
2170 ret = 1;
2171 local_irq_restore(flags);
2172 return ret;
2173}
2174
2175static int irqtime_account_si_update(void)
2176{
2177 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2178 unsigned long flags;
2179 u64 latest_ns;
2180 int ret = 0;
2181
2182 local_irq_save(flags);
2183 latest_ns = this_cpu_read(cpu_softirq_time);
2184 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
2185 ret = 1;
2186 local_irq_restore(flags);
2187 return ret;
2188}
2189
Peter Zijlstrafe44d622010-12-09 14:15:34 +01002190#else /* CONFIG_IRQ_TIME_ACCOUNTING */
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -07002191
Venkatesh Pallipadiabb74ce2010-12-21 17:09:03 -08002192#define sched_clock_irqtime (0)
2193
Glauber Costa095c0aa2011-07-11 15:28:18 -04002194#endif
Venkatesh Pallipadib52bfee2010-10-04 17:03:19 -07002195
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01002196#include "sched_idletask.c"
2197#include "sched_fair.c"
2198#include "sched_rt.c"
Mike Galbraith5091faa2010-11-30 14:18:03 +01002199#include "sched_autogroup.c"
Peter Zijlstra34f971f2010-09-22 13:53:15 +02002200#include "sched_stoptask.c"
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01002201#ifdef CONFIG_SCHED_DEBUG
2202# include "sched_debug.c"
2203#endif
2204
Peter Zijlstra34f971f2010-09-22 13:53:15 +02002205void sched_set_stop_task(int cpu, struct task_struct *stop)
2206{
2207 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
2208 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2209
2210 if (stop) {
2211 /*
2212 * Make it appear like a SCHED_FIFO task, its something
2213 * userspace knows about and won't get confused about.
2214 *
2215 * Also, it will make PI more or less work without too
2216 * much confusion -- but then, stop work should not
2217 * rely on PI working anyway.
2218 */
2219 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2220
2221 stop->sched_class = &stop_sched_class;
2222 }
2223
2224 cpu_rq(cpu)->stop = stop;
2225
2226 if (old_stop) {
2227 /*
2228 * Reset it back to a normal scheduling class so that
2229 * it can die in pieces.
2230 */
2231 old_stop->sched_class = &rt_sched_class;
2232 }
2233}
2234
Peter Zijlstra1e3c88b2009-12-17 17:00:43 +01002235/*
Ingo Molnardd41f592007-07-09 18:51:59 +02002236 * __normal_prio - return the priority that is based on the static prio
Ingo Molnar71f8bd42007-07-09 18:51:59 +02002237 */
Ingo Molnar14531182007-07-09 18:51:59 +02002238static inline int __normal_prio(struct task_struct *p)
2239{
Ingo Molnardd41f592007-07-09 18:51:59 +02002240 return p->static_prio;
Ingo Molnar14531182007-07-09 18:51:59 +02002241}
2242
2243/*
Ingo Molnarb29739f2006-06-27 02:54:51 -07002244 * Calculate the expected normal priority: i.e. priority
2245 * without taking RT-inheritance into account. Might be
2246 * boosted by interactivity modifiers. Changes upon fork,
2247 * setprio syscalls, and whenever the interactivity
2248 * estimator recalculates.
2249 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07002250static inline int normal_prio(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -07002251{
2252 int prio;
2253
Ingo Molnare05606d2007-07-09 18:51:59 +02002254 if (task_has_rt_policy(p))
Ingo Molnarb29739f2006-06-27 02:54:51 -07002255 prio = MAX_RT_PRIO-1 - p->rt_priority;
2256 else
2257 prio = __normal_prio(p);
2258 return prio;
2259}
2260
2261/*
2262 * Calculate the current priority, i.e. the priority
2263 * taken into account by the scheduler. This value might
2264 * be boosted by RT tasks, or might be boosted by
2265 * interactivity modifiers. Will be RT if the task got
2266 * RT-boosted. If not then it returns p->normal_prio.
2267 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07002268static int effective_prio(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -07002269{
2270 p->normal_prio = normal_prio(p);
2271 /*
2272 * If we are RT tasks or we were boosted to RT priority,
2273 * keep the priority unchanged. Otherwise, update priority
2274 * to the normal priority:
2275 */
2276 if (!rt_prio(p->prio))
2277 return p->normal_prio;
2278 return p->prio;
2279}
2280
Linus Torvalds1da177e2005-04-16 15:20:36 -07002281/**
2282 * task_curr - is this task currently executing on a CPU?
2283 * @p: the task in question.
2284 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07002285inline int task_curr(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002286{
2287 return cpu_curr(task_cpu(p)) == p;
2288}
2289
Steven Rostedtcb469842008-01-25 21:08:22 +01002290static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2291 const struct sched_class *prev_class,
Peter Zijlstrada7a7352011-01-17 17:03:27 +01002292 int oldprio)
Steven Rostedtcb469842008-01-25 21:08:22 +01002293{
2294 if (prev_class != p->sched_class) {
2295 if (prev_class->switched_from)
Peter Zijlstrada7a7352011-01-17 17:03:27 +01002296 prev_class->switched_from(rq, p);
2297 p->sched_class->switched_to(rq, p);
2298 } else if (oldprio != p->prio)
2299 p->sched_class->prio_changed(rq, p, oldprio);
Steven Rostedtcb469842008-01-25 21:08:22 +01002300}
2301
Peter Zijlstra1e5a7402010-10-31 12:37:04 +01002302static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2303{
2304 const struct sched_class *class;
2305
2306 if (p->sched_class == rq->curr->sched_class) {
2307 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2308 } else {
2309 for_each_class(class) {
2310 if (class == rq->curr->sched_class)
2311 break;
2312 if (class == p->sched_class) {
2313 resched_task(rq->curr);
2314 break;
2315 }
2316 }
2317 }
2318
2319 /*
2320 * A queue event has occurred, and we're going to schedule. In
2321 * this case, we can save a useless back to back clock update.
2322 */
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02002323 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
Peter Zijlstra1e5a7402010-10-31 12:37:04 +01002324 rq->skip_clock_update = 1;
2325}
2326
Linus Torvalds1da177e2005-04-16 15:20:36 -07002327#ifdef CONFIG_SMP
Ingo Molnarcc367732007-10-15 17:00:18 +02002328/*
2329 * Is this task likely cache-hot:
2330 */
Gregory Haskinse7693a32008-01-25 21:08:09 +01002331static int
Ingo Molnarcc367732007-10-15 17:00:18 +02002332task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2333{
2334 s64 delta;
2335
Peter Zijlstrae6c8fba2009-12-16 18:04:33 +01002336 if (p->sched_class != &fair_sched_class)
2337 return 0;
2338
Nikhil Raoef8002f2010-10-13 12:09:35 -07002339 if (unlikely(p->policy == SCHED_IDLE))
2340 return 0;
2341
Ingo Molnarf540a602008-03-15 17:10:34 +01002342 /*
2343 * Buddy candidates are cache hot:
2344 */
Mike Galbraithf685cea2009-10-23 23:09:22 +02002345 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
Peter Zijlstra47932412008-11-04 21:25:09 +01002346 (&p->se == cfs_rq_of(&p->se)->next ||
2347 &p->se == cfs_rq_of(&p->se)->last))
Ingo Molnarf540a602008-03-15 17:10:34 +01002348 return 1;
2349
Ingo Molnar6bc16652007-10-15 17:00:18 +02002350 if (sysctl_sched_migration_cost == -1)
2351 return 1;
2352 if (sysctl_sched_migration_cost == 0)
2353 return 0;
2354
Ingo Molnarcc367732007-10-15 17:00:18 +02002355 delta = now - p->se.exec_start;
2356
2357 return delta < (s64)sysctl_sched_migration_cost;
2358}
2359
Ingo Molnardd41f592007-07-09 18:51:59 +02002360void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
Ingo Molnarc65cc872007-07-09 18:51:58 +02002361{
Peter Zijlstrae2912002009-12-16 18:04:36 +01002362#ifdef CONFIG_SCHED_DEBUG
2363 /*
2364 * We should never call set_task_cpu() on a blocked task,
2365 * ttwu() will sort out the placement.
2366 */
Peter Zijlstra077614e2009-12-17 13:16:31 +01002367 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2368 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
Peter Zijlstra0122ec52011-04-05 17:23:51 +02002369
2370#ifdef CONFIG_LOCKDEP
Peter Zijlstra6c6c54e2011-06-03 17:37:07 +02002371 /*
2372 * The caller should hold either p->pi_lock or rq->lock, when changing
2373 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2374 *
2375 * sched_move_task() holds both and thus holding either pins the cgroup,
2376 * see set_task_rq().
2377 *
2378 * Furthermore, all task_rq users should acquire both locks, see
2379 * task_rq_lock().
2380 */
Peter Zijlstra0122ec52011-04-05 17:23:51 +02002381 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2382 lockdep_is_held(&task_rq(p)->lock)));
2383#endif
Peter Zijlstrae2912002009-12-16 18:04:36 +01002384#endif
2385
Mathieu Desnoyersde1d7282009-05-05 16:49:59 +08002386 trace_sched_migrate_task(p, new_cpu);
Peter Zijlstracbc34ed2008-12-10 08:08:22 +01002387
Peter Zijlstra0c697742009-12-22 15:43:19 +01002388 if (task_cpu(p) != new_cpu) {
2389 p->se.nr_migrations++;
Peter Zijlstraa8b0ca12011-06-27 14:41:57 +02002390 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
Peter Zijlstra0c697742009-12-22 15:43:19 +01002391 }
Ingo Molnardd41f592007-07-09 18:51:59 +02002392
2393 __set_task_cpu(p, new_cpu);
Ingo Molnarc65cc872007-07-09 18:51:58 +02002394}
2395
Tejun Heo969c7922010-05-06 18:49:21 +02002396struct migration_arg {
Ingo Molnar36c8b582006-07-03 00:25:41 -07002397 struct task_struct *task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002398 int dest_cpu;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002399};
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400
Tejun Heo969c7922010-05-06 18:49:21 +02002401static int migration_cpu_stop(void *data);
2402
Linus Torvalds1da177e2005-04-16 15:20:36 -07002403/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002404 * wait_task_inactive - wait for a thread to unschedule.
2405 *
Roland McGrath85ba2d82008-07-25 19:45:58 -07002406 * If @match_state is nonzero, it's the @p->state value just checked and
2407 * not expected to change. If it changes, i.e. @p might have woken up,
2408 * then return zero. When we succeed in waiting for @p to be off its CPU,
2409 * we return a positive number (its total switch count). If a second call
2410 * a short while later returns the same number, the caller can be sure that
2411 * @p has remained unscheduled the whole time.
2412 *
Linus Torvalds1da177e2005-04-16 15:20:36 -07002413 * The caller must ensure that the task *will* unschedule sometime soon,
2414 * else this function might spin for a *long* time. This function can't
2415 * be called with interrupts off, or it may introduce deadlock with
2416 * smp_call_function() if an IPI is sent by the same process we are
2417 * waiting to become inactive.
2418 */
Roland McGrath85ba2d82008-07-25 19:45:58 -07002419unsigned long wait_task_inactive(struct task_struct *p, long match_state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002420{
2421 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02002422 int running, on_rq;
Roland McGrath85ba2d82008-07-25 19:45:58 -07002423 unsigned long ncsw;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002424 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425
Andi Kleen3a5c3592007-10-15 17:00:14 +02002426 for (;;) {
2427 /*
2428 * We do the initial early heuristics without holding
2429 * any task-queue locks at all. We'll only try to get
2430 * the runqueue lock when things look like they will
2431 * work out!
2432 */
2433 rq = task_rq(p);
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07002434
Andi Kleen3a5c3592007-10-15 17:00:14 +02002435 /*
2436 * If the task is actively running on another CPU
2437 * still, just relax and busy-wait without holding
2438 * any locks.
2439 *
2440 * NOTE! Since we don't hold any locks, it's not
2441 * even sure that "rq" stays as the right runqueue!
2442 * But we don't care, since "task_running()" will
2443 * return false if the runqueue has changed and p
2444 * is actually now running somewhere else!
2445 */
Roland McGrath85ba2d82008-07-25 19:45:58 -07002446 while (task_running(rq, p)) {
2447 if (match_state && unlikely(p->state != match_state))
2448 return 0;
Andi Kleen3a5c3592007-10-15 17:00:14 +02002449 cpu_relax();
Roland McGrath85ba2d82008-07-25 19:45:58 -07002450 }
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07002451
Andi Kleen3a5c3592007-10-15 17:00:14 +02002452 /*
2453 * Ok, time to look more closely! We need the rq
2454 * lock now, to be *sure*. If we're wrong, we'll
2455 * just go back and repeat.
2456 */
2457 rq = task_rq_lock(p, &flags);
Peter Zijlstra27a9da62010-05-04 20:36:56 +02002458 trace_sched_wait_task(p);
Andi Kleen3a5c3592007-10-15 17:00:14 +02002459 running = task_running(rq, p);
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02002460 on_rq = p->on_rq;
Roland McGrath85ba2d82008-07-25 19:45:58 -07002461 ncsw = 0;
Oleg Nesterovf31e11d2008-08-20 16:54:44 -07002462 if (!match_state || p->state == match_state)
Oleg Nesterov93dcf552008-08-20 16:54:44 -07002463 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
Peter Zijlstra0122ec52011-04-05 17:23:51 +02002464 task_rq_unlock(rq, p, &flags);
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07002465
Andi Kleen3a5c3592007-10-15 17:00:14 +02002466 /*
Roland McGrath85ba2d82008-07-25 19:45:58 -07002467 * If it changed from the expected state, bail out now.
2468 */
2469 if (unlikely(!ncsw))
2470 break;
2471
2472 /*
Andi Kleen3a5c3592007-10-15 17:00:14 +02002473 * Was it really running after all now that we
2474 * checked with the proper locks actually held?
2475 *
2476 * Oops. Go back and try again..
2477 */
2478 if (unlikely(running)) {
2479 cpu_relax();
2480 continue;
2481 }
2482
2483 /*
2484 * It's not enough that it's not actively running,
2485 * it must be off the runqueue _entirely_, and not
2486 * preempted!
2487 *
Luis Henriques80dd99b2009-03-16 19:58:09 +00002488 * So if it was still runnable (but just not actively
Andi Kleen3a5c3592007-10-15 17:00:14 +02002489 * running right now), it's preempted, and we should
2490 * yield - it could be a while.
2491 */
2492 if (unlikely(on_rq)) {
Thomas Gleixner8eb90c32011-02-23 23:52:21 +00002493 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
2494
2495 set_current_state(TASK_UNINTERRUPTIBLE);
2496 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
Andi Kleen3a5c3592007-10-15 17:00:14 +02002497 continue;
2498 }
2499
2500 /*
2501 * Ahh, all good. It wasn't running, and it wasn't
2502 * runnable, which means that it will never become
2503 * running in the future either. We're all done!
2504 */
2505 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506 }
Roland McGrath85ba2d82008-07-25 19:45:58 -07002507
2508 return ncsw;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002509}
2510
2511/***
2512 * kick_process - kick a running thread to enter/exit the kernel
2513 * @p: the to-be-kicked thread
2514 *
2515 * Cause a process which is running on another CPU to enter
2516 * kernel-mode, without any delay. (to get signals handled.)
2517 *
Lucas De Marchi25985ed2011-03-30 22:57:33 -03002518 * NOTE: this function doesn't have to take the runqueue lock,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002519 * because all it wants to ensure is that the remote task enters
2520 * the kernel. If the IPI races and the task has been migrated
2521 * to another CPU then no harm is done and the purpose has been
2522 * achieved as well.
2523 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07002524void kick_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002525{
2526 int cpu;
2527
2528 preempt_disable();
2529 cpu = task_cpu(p);
2530 if ((cpu != smp_processor_id()) && task_curr(p))
2531 smp_send_reschedule(cpu);
2532 preempt_enable();
2533}
Rusty Russellb43e3522009-06-12 22:27:00 -06002534EXPORT_SYMBOL_GPL(kick_process);
Nick Piggin476d1392005-06-25 14:57:29 -07002535#endif /* CONFIG_SMP */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002536
Peter Zijlstra970b13b2009-11-25 13:31:39 +01002537#ifdef CONFIG_SMP
Oleg Nesterov30da6882010-03-15 10:10:19 +01002538/*
Peter Zijlstra013fdb82011-04-05 17:23:45 +02002539 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
Oleg Nesterov30da6882010-03-15 10:10:19 +01002540 */
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01002541static int select_fallback_rq(int cpu, struct task_struct *p)
2542{
2543 int dest_cpu;
2544 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2545
2546 /* Look for allowed, online CPU in same node. */
2547 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
Peter Zijlstrafa17b502011-06-16 12:23:22 +02002548 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01002549 return dest_cpu;
2550
2551 /* Any allowed, online CPU? */
Peter Zijlstrafa17b502011-06-16 12:23:22 +02002552 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01002553 if (dest_cpu < nr_cpu_ids)
2554 return dest_cpu;
2555
2556 /* No more Mr. Nice Guy. */
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01002557 dest_cpu = cpuset_cpus_allowed_fallback(p);
2558 /*
2559 * Don't tell them about moving exiting tasks or
2560 * kernel threads (both mm NULL), since they never
2561 * leave kernel.
2562 */
2563 if (p->mm && printk_ratelimit()) {
2564 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2565 task_pid_nr(p), p->comm, cpu);
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01002566 }
2567
2568 return dest_cpu;
2569}
2570
Peter Zijlstrae2912002009-12-16 18:04:36 +01002571/*
Peter Zijlstra013fdb82011-04-05 17:23:45 +02002572 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
Peter Zijlstrae2912002009-12-16 18:04:36 +01002573 */
Peter Zijlstra970b13b2009-11-25 13:31:39 +01002574static inline
Peter Zijlstra7608dec2011-04-05 17:23:46 +02002575int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
Peter Zijlstra970b13b2009-11-25 13:31:39 +01002576{
Peter Zijlstra7608dec2011-04-05 17:23:46 +02002577 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
Peter Zijlstrae2912002009-12-16 18:04:36 +01002578
2579 /*
2580 * In order not to call set_task_cpu() on a blocking task we need
2581 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2582 * cpu.
2583 *
2584 * Since this is common to all placement strategies, this lives here.
2585 *
2586 * [ this allows ->select_task() to simply return task_cpu(p) and
2587 * not worry about this generic constraint ]
2588 */
Peter Zijlstrafa17b502011-06-16 12:23:22 +02002589 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
Peter Zijlstra70f11202009-12-20 17:36:27 +01002590 !cpu_online(cpu)))
Peter Zijlstra5da9a0f2009-12-16 18:04:38 +01002591 cpu = select_fallback_rq(task_cpu(p), p);
Peter Zijlstrae2912002009-12-16 18:04:36 +01002592
2593 return cpu;
Peter Zijlstra970b13b2009-11-25 13:31:39 +01002594}
Mike Galbraith09a40af2010-04-15 07:29:59 +02002595
2596static void update_avg(u64 *avg, u64 sample)
2597{
2598 s64 diff = sample - *avg;
2599 *avg += diff >> 3;
2600}
Peter Zijlstra970b13b2009-11-25 13:31:39 +01002601#endif
2602
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002603static void
Peter Zijlstrab84cb5d2011-04-05 17:23:55 +02002604ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
Tejun Heo9ed38112009-12-03 15:08:03 +09002605{
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002606#ifdef CONFIG_SCHEDSTATS
Peter Zijlstrab84cb5d2011-04-05 17:23:55 +02002607 struct rq *rq = this_rq();
Tejun Heo9ed38112009-12-03 15:08:03 +09002608
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002609#ifdef CONFIG_SMP
2610 int this_cpu = smp_processor_id();
Tejun Heo9ed38112009-12-03 15:08:03 +09002611
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002612 if (cpu == this_cpu) {
2613 schedstat_inc(rq, ttwu_local);
2614 schedstat_inc(p, se.statistics.nr_wakeups_local);
2615 } else {
2616 struct sched_domain *sd;
2617
2618 schedstat_inc(p, se.statistics.nr_wakeups_remote);
Peter Zijlstra057f3fa2011-04-18 11:24:34 +02002619 rcu_read_lock();
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002620 for_each_domain(this_cpu, sd) {
2621 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2622 schedstat_inc(sd, ttwu_wake_remote);
2623 break;
2624 }
2625 }
Peter Zijlstra057f3fa2011-04-18 11:24:34 +02002626 rcu_read_unlock();
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002627 }
Peter Zijlstraf339b9d2011-05-31 10:49:20 +02002628
2629 if (wake_flags & WF_MIGRATED)
2630 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2631
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002632#endif /* CONFIG_SMP */
2633
2634 schedstat_inc(rq, ttwu_count);
2635 schedstat_inc(p, se.statistics.nr_wakeups);
2636
2637 if (wake_flags & WF_SYNC)
2638 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2639
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002640#endif /* CONFIG_SCHEDSTATS */
Tejun Heo9ed38112009-12-03 15:08:03 +09002641}
2642
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002643static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
Tejun Heo9ed38112009-12-03 15:08:03 +09002644{
Tejun Heo9ed38112009-12-03 15:08:03 +09002645 activate_task(rq, p, en_flags);
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02002646 p->on_rq = 1;
Peter Zijlstrac2f71152011-04-13 13:28:56 +02002647
2648 /* if a worker is waking up, notify workqueue */
2649 if (p->flags & PF_WQ_WORKER)
2650 wq_worker_waking_up(p, cpu_of(rq));
Tejun Heo9ed38112009-12-03 15:08:03 +09002651}
2652
Peter Zijlstra23f41ee2011-04-05 17:23:56 +02002653/*
2654 * Mark the task runnable and perform wakeup-preemption.
2655 */
Peter Zijlstra89363382011-04-05 17:23:42 +02002656static void
Peter Zijlstra23f41ee2011-04-05 17:23:56 +02002657ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
Tejun Heo9ed38112009-12-03 15:08:03 +09002658{
Peter Zijlstra89363382011-04-05 17:23:42 +02002659 trace_sched_wakeup(p, true);
Tejun Heo9ed38112009-12-03 15:08:03 +09002660 check_preempt_curr(rq, p, wake_flags);
2661
2662 p->state = TASK_RUNNING;
2663#ifdef CONFIG_SMP
2664 if (p->sched_class->task_woken)
2665 p->sched_class->task_woken(rq, p);
2666
Steven Rostedte69c6342010-12-06 17:10:31 -05002667 if (rq->idle_stamp) {
Tejun Heo9ed38112009-12-03 15:08:03 +09002668 u64 delta = rq->clock - rq->idle_stamp;
2669 u64 max = 2*sysctl_sched_migration_cost;
2670
2671 if (delta > max)
2672 rq->avg_idle = max;
2673 else
2674 update_avg(&rq->avg_idle, delta);
2675 rq->idle_stamp = 0;
2676 }
2677#endif
2678}
2679
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002680static void
2681ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2682{
2683#ifdef CONFIG_SMP
2684 if (p->sched_contributes_to_load)
2685 rq->nr_uninterruptible--;
2686#endif
2687
2688 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2689 ttwu_do_wakeup(rq, p, wake_flags);
2690}
2691
2692/*
2693 * Called in case the task @p isn't fully descheduled from its runqueue,
2694 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2695 * since all we need to do is flip p->state to TASK_RUNNING, since
2696 * the task is still ->on_rq.
2697 */
2698static int ttwu_remote(struct task_struct *p, int wake_flags)
2699{
2700 struct rq *rq;
2701 int ret = 0;
2702
2703 rq = __task_rq_lock(p);
2704 if (p->on_rq) {
2705 ttwu_do_wakeup(rq, p, wake_flags);
2706 ret = 1;
2707 }
2708 __task_rq_unlock(rq);
2709
2710 return ret;
2711}
2712
Peter Zijlstra317f3942011-04-05 17:23:58 +02002713#ifdef CONFIG_SMP
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02002714static void sched_ttwu_pending(void)
Peter Zijlstra317f3942011-04-05 17:23:58 +02002715{
2716 struct rq *rq = this_rq();
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02002717 struct llist_node *llist = llist_del_all(&rq->wake_list);
2718 struct task_struct *p;
Peter Zijlstra317f3942011-04-05 17:23:58 +02002719
2720 raw_spin_lock(&rq->lock);
2721
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02002722 while (llist) {
2723 p = llist_entry(llist, struct task_struct, wake_entry);
2724 llist = llist_next(llist);
Peter Zijlstra317f3942011-04-05 17:23:58 +02002725 ttwu_do_activate(rq, p, 0);
2726 }
2727
2728 raw_spin_unlock(&rq->lock);
2729}
2730
2731void scheduler_ipi(void)
2732{
Suresh Siddhaca380622011-10-03 15:09:00 -07002733 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
Peter Zijlstrac5d753a2011-07-19 15:07:25 -07002734 return;
2735
2736 /*
2737 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
2738 * traditionally all their work was done from the interrupt return
2739 * path. Now that we actually do some work, we need to make sure
2740 * we do call them.
2741 *
2742 * Some archs already do call them, luckily irq_enter/exit nest
2743 * properly.
2744 *
2745 * Arguably we should visit all archs and update all handlers,
2746 * however a fair share of IPIs are still resched only so this would
2747 * somewhat pessimize the simple resched case.
2748 */
2749 irq_enter();
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02002750 sched_ttwu_pending();
Suresh Siddhaca380622011-10-03 15:09:00 -07002751
2752 /*
2753 * Check if someone kicked us for doing the nohz idle load balance.
2754 */
Suresh Siddha6eb57e02011-10-03 15:09:01 -07002755 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
2756 this_rq()->idle_balance = 1;
Suresh Siddhaca380622011-10-03 15:09:00 -07002757 raise_softirq_irqoff(SCHED_SOFTIRQ);
Suresh Siddha6eb57e02011-10-03 15:09:01 -07002758 }
Peter Zijlstrac5d753a2011-07-19 15:07:25 -07002759 irq_exit();
Peter Zijlstra317f3942011-04-05 17:23:58 +02002760}
2761
2762static void ttwu_queue_remote(struct task_struct *p, int cpu)
2763{
Peter Zijlstrafa14ff42011-09-12 13:06:17 +02002764 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
Peter Zijlstra317f3942011-04-05 17:23:58 +02002765 smp_send_reschedule(cpu);
2766}
Peter Zijlstrad6aa8f82011-05-26 14:21:33 +02002767
2768#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2769static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2770{
2771 struct rq *rq;
2772 int ret = 0;
2773
2774 rq = __task_rq_lock(p);
2775 if (p->on_cpu) {
2776 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2777 ttwu_do_wakeup(rq, p, wake_flags);
2778 ret = 1;
2779 }
2780 __task_rq_unlock(rq);
2781
2782 return ret;
2783
2784}
2785#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2786#endif /* CONFIG_SMP */
Peter Zijlstra317f3942011-04-05 17:23:58 +02002787
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002788static void ttwu_queue(struct task_struct *p, int cpu)
2789{
2790 struct rq *rq = cpu_rq(cpu);
2791
Daniel Hellstrom17d9f312011-05-20 04:01:10 +00002792#if defined(CONFIG_SMP)
Peter Zijlstra317f3942011-04-05 17:23:58 +02002793 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
Peter Zijlstraf01114c2011-05-31 12:26:55 +02002794 sched_clock_cpu(cpu); /* sync clocks x-cpu */
Peter Zijlstra317f3942011-04-05 17:23:58 +02002795 ttwu_queue_remote(p, cpu);
2796 return;
2797 }
2798#endif
2799
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002800 raw_spin_lock(&rq->lock);
2801 ttwu_do_activate(rq, p, 0);
2802 raw_spin_unlock(&rq->lock);
Tejun Heo9ed38112009-12-03 15:08:03 +09002803}
2804
2805/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07002806 * try_to_wake_up - wake up a thread
Tejun Heo9ed38112009-12-03 15:08:03 +09002807 * @p: the thread to be awakened
Linus Torvalds1da177e2005-04-16 15:20:36 -07002808 * @state: the mask of task states that can be woken
Tejun Heo9ed38112009-12-03 15:08:03 +09002809 * @wake_flags: wake modifier flags (WF_*)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002810 *
2811 * Put it on the run-queue if it's not already there. The "current"
2812 * thread is always on the run-queue (except when the actual
2813 * re-schedule is in progress), and as such you're allowed to do
2814 * the simpler "current->state = TASK_RUNNING" to mark yourself
2815 * runnable without the overhead of this.
2816 *
Tejun Heo9ed38112009-12-03 15:08:03 +09002817 * Returns %true if @p was woken up, %false if it was already running
2818 * or @state didn't match @p's state.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002819 */
Peter Zijlstrae4a52bc2011-04-05 17:23:54 +02002820static int
2821try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002822{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002823 unsigned long flags;
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002824 int cpu, success = 0;
Peter Zijlstra2398f2c2008-06-27 13:41:35 +02002825
Linus Torvalds04e2f172008-02-23 18:05:03 -08002826 smp_wmb();
Peter Zijlstra013fdb82011-04-05 17:23:45 +02002827 raw_spin_lock_irqsave(&p->pi_lock, flags);
Peter Zijlstrae9c84312009-09-15 14:43:03 +02002828 if (!(p->state & state))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002829 goto out;
2830
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002831 success = 1; /* we're going to change ->state */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002832 cpu = task_cpu(p);
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002833
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002834 if (p->on_rq && ttwu_remote(p, wake_flags))
2835 goto stat;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002836
2837#ifdef CONFIG_SMP
Peter Zijlstrae9c84312009-09-15 14:43:03 +02002838 /*
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002839 * If the owning (remote) cpu is still in the middle of schedule() with
2840 * this task as prev, wait until its done referencing the task.
Peter Zijlstrae9c84312009-09-15 14:43:03 +02002841 */
Peter Zijlstrae4a52bc2011-04-05 17:23:54 +02002842 while (p->on_cpu) {
2843#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2844 /*
Peter Zijlstrad6aa8f82011-05-26 14:21:33 +02002845 * In case the architecture enables interrupts in
2846 * context_switch(), we cannot busy wait, since that
2847 * would lead to deadlocks when an interrupt hits and
2848 * tries to wake up @prev. So bail and do a complete
2849 * remote wakeup.
Peter Zijlstrae4a52bc2011-04-05 17:23:54 +02002850 */
Peter Zijlstrad6aa8f82011-05-26 14:21:33 +02002851 if (ttwu_activate_remote(p, wake_flags))
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002852 goto stat;
Peter Zijlstrad6aa8f82011-05-26 14:21:33 +02002853#else
Peter Zijlstrae4a52bc2011-04-05 17:23:54 +02002854 cpu_relax();
Peter Zijlstrad6aa8f82011-05-26 14:21:33 +02002855#endif
Peter Zijlstracc87f762010-03-26 12:22:14 +01002856 }
Peter Zijlstrae4a52bc2011-04-05 17:23:54 +02002857 /*
2858 * Pairs with the smp_wmb() in finish_lock_switch().
2859 */
2860 smp_rmb();
Linus Torvalds1da177e2005-04-16 15:20:36 -07002861
Peter Zijlstraa8e4f2e2011-04-05 17:23:49 +02002862 p->sched_contributes_to_load = !!task_contributes_to_load(p);
Peter Zijlstrae9c84312009-09-15 14:43:03 +02002863 p->state = TASK_WAKING;
Peter Zijlstraefbbd052009-12-16 18:04:40 +01002864
Peter Zijlstrae4a52bc2011-04-05 17:23:54 +02002865 if (p->sched_class->task_waking)
Peter Zijlstra74f8e4b2011-04-05 17:23:47 +02002866 p->sched_class->task_waking(p);
Peter Zijlstraab19cb22009-11-27 15:44:43 +01002867
Peter Zijlstra7608dec2011-04-05 17:23:46 +02002868 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
Peter Zijlstraf339b9d2011-05-31 10:49:20 +02002869 if (task_cpu(p) != cpu) {
2870 wake_flags |= WF_MIGRATED;
Mike Galbraithf5dc3752009-10-09 08:35:03 +02002871 set_task_cpu(p, cpu);
Peter Zijlstraf339b9d2011-05-31 10:49:20 +02002872 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002873#endif /* CONFIG_SMP */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002874
Peter Zijlstrac05fbaf2011-04-05 17:23:57 +02002875 ttwu_queue(p, cpu);
2876stat:
Peter Zijlstrab84cb5d2011-04-05 17:23:55 +02002877 ttwu_stat(p, cpu, wake_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002878out:
Peter Zijlstra013fdb82011-04-05 17:23:45 +02002879 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002880
2881 return success;
2882}
2883
David Howells50fa6102009-04-28 15:01:38 +01002884/**
Tejun Heo21aa9af2010-06-08 21:40:37 +02002885 * try_to_wake_up_local - try to wake up a local task with rq lock held
2886 * @p: the thread to be awakened
2887 *
Peter Zijlstra2acca552011-04-05 17:23:50 +02002888 * Put @p on the run-queue if it's not already there. The caller must
Tejun Heo21aa9af2010-06-08 21:40:37 +02002889 * ensure that this_rq() is locked, @p is bound to this_rq() and not
Peter Zijlstra2acca552011-04-05 17:23:50 +02002890 * the current task.
Tejun Heo21aa9af2010-06-08 21:40:37 +02002891 */
2892static void try_to_wake_up_local(struct task_struct *p)
2893{
2894 struct rq *rq = task_rq(p);
Tejun Heo21aa9af2010-06-08 21:40:37 +02002895
2896 BUG_ON(rq != this_rq());
2897 BUG_ON(p == current);
2898 lockdep_assert_held(&rq->lock);
2899
Peter Zijlstra2acca552011-04-05 17:23:50 +02002900 if (!raw_spin_trylock(&p->pi_lock)) {
2901 raw_spin_unlock(&rq->lock);
2902 raw_spin_lock(&p->pi_lock);
2903 raw_spin_lock(&rq->lock);
Tejun Heo21aa9af2010-06-08 21:40:37 +02002904 }
Peter Zijlstra2acca552011-04-05 17:23:50 +02002905
Tejun Heo21aa9af2010-06-08 21:40:37 +02002906 if (!(p->state & TASK_NORMAL))
Peter Zijlstra2acca552011-04-05 17:23:50 +02002907 goto out;
Tejun Heo21aa9af2010-06-08 21:40:37 +02002908
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02002909 if (!p->on_rq)
Peter Zijlstrad7c01d22011-04-05 17:23:43 +02002910 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2911
Peter Zijlstra23f41ee2011-04-05 17:23:56 +02002912 ttwu_do_wakeup(rq, p, 0);
Peter Zijlstrab84cb5d2011-04-05 17:23:55 +02002913 ttwu_stat(p, smp_processor_id(), 0);
Peter Zijlstra2acca552011-04-05 17:23:50 +02002914out:
2915 raw_spin_unlock(&p->pi_lock);
Tejun Heo21aa9af2010-06-08 21:40:37 +02002916}
2917
2918/**
David Howells50fa6102009-04-28 15:01:38 +01002919 * wake_up_process - Wake up a specific process
2920 * @p: The process to be woken up.
2921 *
2922 * Attempt to wake up the nominated process and move it to the set of runnable
2923 * processes. Returns 1 if the process was woken up, 0 if it was already
2924 * running.
2925 *
2926 * It may be assumed that this function implies a write memory barrier before
2927 * changing the task state if and only if any tasks are woken up.
2928 */
Harvey Harrison7ad5b3a2008-02-08 04:19:53 -08002929int wake_up_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002930{
Matthew Wilcoxd9514f62007-12-06 11:07:07 -05002931 return try_to_wake_up(p, TASK_ALL, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002932}
Linus Torvalds1da177e2005-04-16 15:20:36 -07002933EXPORT_SYMBOL(wake_up_process);
2934
Harvey Harrison7ad5b3a2008-02-08 04:19:53 -08002935int wake_up_state(struct task_struct *p, unsigned int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002936{
2937 return try_to_wake_up(p, state, 0);
2938}
2939
Linus Torvalds1da177e2005-04-16 15:20:36 -07002940/*
2941 * Perform scheduler related setup for a newly forked process p.
2942 * p is forked by current.
Ingo Molnardd41f592007-07-09 18:51:59 +02002943 *
2944 * __sched_fork() is basic setup used by init_idle() too:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002945 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002946static void __sched_fork(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002947{
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02002948 p->on_rq = 0;
2949
2950 p->se.on_rq = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02002951 p->se.exec_start = 0;
2952 p->se.sum_exec_runtime = 0;
Ingo Molnarf6cf8912007-08-28 12:53:24 +02002953 p->se.prev_sum_exec_runtime = 0;
Ingo Molnar6c594c22008-12-14 12:34:15 +01002954 p->se.nr_migrations = 0;
Peter Zijlstrada7a7352011-01-17 17:03:27 +01002955 p->se.vruntime = 0;
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02002956 INIT_LIST_HEAD(&p->se.group_node);
Ingo Molnar6cfb0d52007-08-02 17:41:40 +02002957
2958#ifdef CONFIG_SCHEDSTATS
Lucas De Marchi41acab82010-03-10 23:37:45 -03002959 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
Ingo Molnar6cfb0d52007-08-02 17:41:40 +02002960#endif
Nick Piggin476d1392005-06-25 14:57:29 -07002961
Peter Zijlstrafa717062008-01-25 21:08:27 +01002962 INIT_LIST_HEAD(&p->rt.run_list);
Nick Piggin476d1392005-06-25 14:57:29 -07002963
Avi Kivitye107be32007-07-26 13:40:43 +02002964#ifdef CONFIG_PREEMPT_NOTIFIERS
2965 INIT_HLIST_HEAD(&p->preempt_notifiers);
2966#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02002967}
2968
2969/*
2970 * fork()/clone()-time setup:
2971 */
Samir Bellabes3e51e3e2011-05-11 18:18:05 +02002972void sched_fork(struct task_struct *p)
Ingo Molnardd41f592007-07-09 18:51:59 +02002973{
Peter Zijlstra0122ec52011-04-05 17:23:51 +02002974 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02002975 int cpu = get_cpu();
2976
2977 __sched_fork(p);
Peter Zijlstra06b83b52009-12-16 18:04:35 +01002978 /*
Peter Zijlstra0017d732010-03-24 18:34:10 +01002979 * We mark the process as running here. This guarantees that
Peter Zijlstra06b83b52009-12-16 18:04:35 +01002980 * nobody will actually run it, and a signal or other external
2981 * event cannot wake it up and insert it on the runqueue either.
2982 */
Peter Zijlstra0017d732010-03-24 18:34:10 +01002983 p->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02002984
Ingo Molnarb29739f2006-06-27 02:54:51 -07002985 /*
Mike Galbraithc350a042011-07-27 17:14:55 +02002986 * Make sure we do not leak PI boosting priority to the child.
2987 */
2988 p->prio = current->normal_prio;
2989
2990 /*
Mike Galbraithb9dc29e2009-06-17 10:46:01 +02002991 * Revert to default priority/policy on fork if requested.
2992 */
2993 if (unlikely(p->sched_reset_on_fork)) {
Mike Galbraithc350a042011-07-27 17:14:55 +02002994 if (task_has_rt_policy(p)) {
Mike Galbraithb9dc29e2009-06-17 10:46:01 +02002995 p->policy = SCHED_NORMAL;
Mike Galbraith6c697bd2009-06-17 10:48:02 +02002996 p->static_prio = NICE_TO_PRIO(0);
Mike Galbraithc350a042011-07-27 17:14:55 +02002997 p->rt_priority = 0;
2998 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2999 p->static_prio = NICE_TO_PRIO(0);
3000
3001 p->prio = p->normal_prio = __normal_prio(p);
3002 set_load_weight(p);
Mike Galbraith6c697bd2009-06-17 10:48:02 +02003003
Mike Galbraithb9dc29e2009-06-17 10:46:01 +02003004 /*
3005 * We don't need the reset flag anymore after the fork. It has
3006 * fulfilled its duty:
3007 */
3008 p->sched_reset_on_fork = 0;
3009 }
Lennart Poetteringca94c442009-06-15 17:17:47 +02003010
Hiroshi Shimamoto2ddbf952007-10-15 17:00:11 +02003011 if (!rt_prio(p->prio))
3012 p->sched_class = &fair_sched_class;
Ingo Molnarb29739f2006-06-27 02:54:51 -07003013
Peter Zijlstracd29fe62009-11-27 17:32:46 +01003014 if (p->sched_class->task_fork)
3015 p->sched_class->task_fork(p);
3016
Peter Zijlstra86951592010-06-22 11:44:53 +02003017 /*
3018 * The child is not yet in the pid-hash so no cgroup attach races,
3019 * and the cgroup is pinned to this child due to cgroup_fork()
3020 * is ran before sched_fork().
3021 *
3022 * Silence PROVE_RCU.
3023 */
Peter Zijlstra0122ec52011-04-05 17:23:51 +02003024 raw_spin_lock_irqsave(&p->pi_lock, flags);
Peter Zijlstra5f3edc12009-09-10 13:42:00 +02003025 set_task_cpu(p, cpu);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02003026 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
Peter Zijlstra5f3edc12009-09-10 13:42:00 +02003027
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07003028#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
Ingo Molnardd41f592007-07-09 18:51:59 +02003029 if (likely(sched_info_on()))
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07003030 memset(&p->sched_info, 0, sizeof(p->sched_info));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003031#endif
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02003032#if defined(CONFIG_SMP)
3033 p->on_cpu = 0;
Nick Piggin4866cde2005-06-25 14:57:23 -07003034#endif
Frederic Weisbeckerbdd4e852011-06-08 01:13:27 +02003035#ifdef CONFIG_PREEMPT_COUNT
Nick Piggin4866cde2005-06-25 14:57:23 -07003036 /* Want to start with kernel preemption disabled. */
Al Viroa1261f52005-11-13 16:06:55 -08003037 task_thread_info(p)->preempt_count = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003038#endif
Dario Faggioli806c09a2010-11-30 19:51:33 +01003039#ifdef CONFIG_SMP
Gregory Haskins917b6272008-12-29 09:39:53 -05003040 plist_node_init(&p->pushable_tasks, MAX_PRIO);
Dario Faggioli806c09a2010-11-30 19:51:33 +01003041#endif
Gregory Haskins917b6272008-12-29 09:39:53 -05003042
Nick Piggin476d1392005-06-25 14:57:29 -07003043 put_cpu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003044}
3045
3046/*
3047 * wake_up_new_task - wake up a newly created task for the first time.
3048 *
3049 * This function will do some initial scheduler statistics housekeeping
3050 * that must be done for every newly created context, then puts the task
3051 * on the runqueue and wakes it.
3052 */
Samir Bellabes3e51e3e2011-05-11 18:18:05 +02003053void wake_up_new_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003054{
3055 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02003056 struct rq *rq;
Peter Zijlstrafabf3182010-01-21 21:04:57 +01003057
Peter Zijlstraab2515c2011-04-05 17:23:52 +02003058 raw_spin_lock_irqsave(&p->pi_lock, flags);
Peter Zijlstrafabf3182010-01-21 21:04:57 +01003059#ifdef CONFIG_SMP
3060 /*
3061 * Fork balancing, do it here and not earlier because:
3062 * - cpus_allowed can change in the fork path
3063 * - any previously selected cpu might disappear through hotplug
Peter Zijlstrafabf3182010-01-21 21:04:57 +01003064 */
Peter Zijlstraab2515c2011-04-05 17:23:52 +02003065 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
Peter Zijlstrafabf3182010-01-21 21:04:57 +01003066#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003067
Peter Zijlstraab2515c2011-04-05 17:23:52 +02003068 rq = __task_rq_lock(p);
Peter Zijlstracd29fe62009-11-27 17:32:46 +01003069 activate_task(rq, p, 0);
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02003070 p->on_rq = 1;
Peter Zijlstra89363382011-04-05 17:23:42 +02003071 trace_sched_wakeup_new(p, true);
Peter Zijlstraa7558e02009-09-14 20:02:34 +02003072 check_preempt_curr(rq, p, WF_FORK);
Steven Rostedt9a897c52008-01-25 21:08:22 +01003073#ifdef CONFIG_SMP
Peter Zijlstraefbbd052009-12-16 18:04:40 +01003074 if (p->sched_class->task_woken)
3075 p->sched_class->task_woken(rq, p);
Steven Rostedt9a897c52008-01-25 21:08:22 +01003076#endif
Peter Zijlstra0122ec52011-04-05 17:23:51 +02003077 task_rq_unlock(rq, p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003078}
3079
Avi Kivitye107be32007-07-26 13:40:43 +02003080#ifdef CONFIG_PREEMPT_NOTIFIERS
3081
3082/**
Luis Henriques80dd99b2009-03-16 19:58:09 +00003083 * preempt_notifier_register - tell me when current is being preempted & rescheduled
Randy Dunlap421cee22007-07-31 00:37:50 -07003084 * @notifier: notifier struct to register
Avi Kivitye107be32007-07-26 13:40:43 +02003085 */
3086void preempt_notifier_register(struct preempt_notifier *notifier)
3087{
3088 hlist_add_head(&notifier->link, &current->preempt_notifiers);
3089}
3090EXPORT_SYMBOL_GPL(preempt_notifier_register);
3091
3092/**
3093 * preempt_notifier_unregister - no longer interested in preemption notifications
Randy Dunlap421cee22007-07-31 00:37:50 -07003094 * @notifier: notifier struct to unregister
Avi Kivitye107be32007-07-26 13:40:43 +02003095 *
3096 * This is safe to call from within a preemption notifier.
3097 */
3098void preempt_notifier_unregister(struct preempt_notifier *notifier)
3099{
3100 hlist_del(&notifier->link);
3101}
3102EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3103
3104static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3105{
3106 struct preempt_notifier *notifier;
3107 struct hlist_node *node;
3108
3109 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
3110 notifier->ops->sched_in(notifier, raw_smp_processor_id());
3111}
3112
3113static void
3114fire_sched_out_preempt_notifiers(struct task_struct *curr,
3115 struct task_struct *next)
3116{
3117 struct preempt_notifier *notifier;
3118 struct hlist_node *node;
3119
3120 hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
3121 notifier->ops->sched_out(notifier, next);
3122}
3123
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02003124#else /* !CONFIG_PREEMPT_NOTIFIERS */
Avi Kivitye107be32007-07-26 13:40:43 +02003125
3126static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3127{
3128}
3129
3130static void
3131fire_sched_out_preempt_notifiers(struct task_struct *curr,
3132 struct task_struct *next)
3133{
3134}
3135
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02003136#endif /* CONFIG_PREEMPT_NOTIFIERS */
Avi Kivitye107be32007-07-26 13:40:43 +02003137
Linus Torvalds1da177e2005-04-16 15:20:36 -07003138/**
Nick Piggin4866cde2005-06-25 14:57:23 -07003139 * prepare_task_switch - prepare to switch tasks
3140 * @rq: the runqueue preparing to switch
Randy Dunlap421cee22007-07-31 00:37:50 -07003141 * @prev: the current task that is being switched out
Nick Piggin4866cde2005-06-25 14:57:23 -07003142 * @next: the task we are going to switch to.
3143 *
3144 * This is called with the rq lock held and interrupts off. It must
3145 * be paired with a subsequent finish_task_switch after the context
3146 * switch.
3147 *
3148 * prepare_task_switch sets up locking and calls architecture specific
3149 * hooks.
3150 */
Avi Kivitye107be32007-07-26 13:40:43 +02003151static inline void
3152prepare_task_switch(struct rq *rq, struct task_struct *prev,
3153 struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07003154{
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003155 sched_info_switch(prev, next);
3156 perf_event_task_sched_out(prev, next);
Avi Kivitye107be32007-07-26 13:40:43 +02003157 fire_sched_out_preempt_notifiers(prev, next);
Nick Piggin4866cde2005-06-25 14:57:23 -07003158 prepare_lock_switch(rq, next);
3159 prepare_arch_switch(next);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003160 trace_sched_switch(prev, next);
Nick Piggin4866cde2005-06-25 14:57:23 -07003161}
3162
3163/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07003164 * finish_task_switch - clean up after a task-switch
Jeff Garzik344baba2005-09-07 01:15:17 -04003165 * @rq: runqueue associated with task-switch
Linus Torvalds1da177e2005-04-16 15:20:36 -07003166 * @prev: the thread we just switched away from.
3167 *
Nick Piggin4866cde2005-06-25 14:57:23 -07003168 * finish_task_switch must be called after the context switch, paired
3169 * with a prepare_task_switch call before the context switch.
3170 * finish_task_switch will reconcile locking set up by prepare_task_switch,
3171 * and do any other architecture-specific cleanup actions.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003172 *
3173 * Note that we may have delayed dropping an mm in context_switch(). If
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01003174 * so, we finish that here outside of the runqueue lock. (Doing it
Linus Torvalds1da177e2005-04-16 15:20:36 -07003175 * with the lock held can cause deadlocks; see schedule() for
3176 * details.)
3177 */
Alexey Dobriyana9957442007-10-15 17:00:13 +02003178static void finish_task_switch(struct rq *rq, struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003179 __releases(rq->lock)
3180{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003181 struct mm_struct *mm = rq->prev_mm;
Oleg Nesterov55a101f2006-09-29 02:01:10 -07003182 long prev_state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003183
3184 rq->prev_mm = NULL;
3185
3186 /*
3187 * A task struct has one reference for the use as "current".
Oleg Nesterovc394cc92006-09-29 02:01:11 -07003188 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
Oleg Nesterov55a101f2006-09-29 02:01:10 -07003189 * schedule one last time. The schedule call will never return, and
3190 * the scheduled task must drop that reference.
Oleg Nesterovc394cc92006-09-29 02:01:11 -07003191 * The test for TASK_DEAD must occur while the runqueue locks are
Linus Torvalds1da177e2005-04-16 15:20:36 -07003192 * still held, otherwise prev could be scheduled on another cpu, die
3193 * there before we look at prev->state, and then the reference would
3194 * be dropped twice.
3195 * Manfred Spraul <manfred@colorfullife.com>
3196 */
Oleg Nesterov55a101f2006-09-29 02:01:10 -07003197 prev_state = prev->state;
Nick Piggin4866cde2005-06-25 14:57:23 -07003198 finish_arch_switch(prev);
Jamie Iles8381f652010-01-08 15:27:33 +00003199#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3200 local_irq_disable();
3201#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
Stephane Eraniana8d757e2011-08-25 15:58:03 +02003202 perf_event_task_sched_in(prev, current);
Jamie Iles8381f652010-01-08 15:27:33 +00003203#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3204 local_irq_enable();
3205#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
Nick Piggin4866cde2005-06-25 14:57:23 -07003206 finish_lock_switch(rq, prev);
Steven Rostedte8fa1362008-01-25 21:08:05 +01003207
Avi Kivitye107be32007-07-26 13:40:43 +02003208 fire_sched_in_preempt_notifiers(current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003209 if (mm)
3210 mmdrop(mm);
Oleg Nesterovc394cc92006-09-29 02:01:11 -07003211 if (unlikely(prev_state == TASK_DEAD)) {
bibo maoc6fd91f2006-03-26 01:38:20 -08003212 /*
3213 * Remove function-return probe instances associated with this
3214 * task and put them back on the free list.
Ingo Molnar9761eea2007-07-09 18:52:00 +02003215 */
bibo maoc6fd91f2006-03-26 01:38:20 -08003216 kprobe_flush_task(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003217 put_task_struct(prev);
bibo maoc6fd91f2006-03-26 01:38:20 -08003218 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003219}
3220
Gregory Haskins3f029d32009-07-29 11:08:47 -04003221#ifdef CONFIG_SMP
3222
3223/* assumes rq->lock is held */
3224static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
3225{
3226 if (prev->sched_class->pre_schedule)
3227 prev->sched_class->pre_schedule(rq, prev);
3228}
3229
3230/* rq->lock is NOT held, but preemption is disabled */
3231static inline void post_schedule(struct rq *rq)
3232{
3233 if (rq->post_schedule) {
3234 unsigned long flags;
3235
Thomas Gleixner05fa7852009-11-17 14:28:38 +01003236 raw_spin_lock_irqsave(&rq->lock, flags);
Gregory Haskins3f029d32009-07-29 11:08:47 -04003237 if (rq->curr->sched_class->post_schedule)
3238 rq->curr->sched_class->post_schedule(rq);
Thomas Gleixner05fa7852009-11-17 14:28:38 +01003239 raw_spin_unlock_irqrestore(&rq->lock, flags);
Gregory Haskins3f029d32009-07-29 11:08:47 -04003240
3241 rq->post_schedule = 0;
3242 }
3243}
3244
3245#else
3246
3247static inline void pre_schedule(struct rq *rq, struct task_struct *p)
3248{
3249}
3250
3251static inline void post_schedule(struct rq *rq)
3252{
3253}
3254
3255#endif
3256
Linus Torvalds1da177e2005-04-16 15:20:36 -07003257/**
3258 * schedule_tail - first thing a freshly forked thread must call.
3259 * @prev: the thread we just switched away from.
3260 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003261asmlinkage void schedule_tail(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003262 __releases(rq->lock)
3263{
Ingo Molnar70b97a72006-07-03 00:25:42 -07003264 struct rq *rq = this_rq();
3265
Nick Piggin4866cde2005-06-25 14:57:23 -07003266 finish_task_switch(rq, prev);
Steven Rostedtda19ab52009-07-29 00:21:22 -04003267
Gregory Haskins3f029d32009-07-29 11:08:47 -04003268 /*
3269 * FIXME: do we need to worry about rq being invalidated by the
3270 * task_switch?
3271 */
3272 post_schedule(rq);
Steven Rostedtda19ab52009-07-29 00:21:22 -04003273
Nick Piggin4866cde2005-06-25 14:57:23 -07003274#ifdef __ARCH_WANT_UNLOCKED_CTXSW
3275 /* In this case, finish_task_switch does not reenable preemption */
3276 preempt_enable();
3277#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003278 if (current->set_child_tid)
Pavel Emelyanovb4888932007-10-18 23:40:14 -07003279 put_user(task_pid_vnr(current), current->set_child_tid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003280}
3281
3282/*
3283 * context_switch - switch to the new MM and the new
3284 * thread's register state.
3285 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003286static inline void
Ingo Molnar70b97a72006-07-03 00:25:42 -07003287context_switch(struct rq *rq, struct task_struct *prev,
Ingo Molnar36c8b582006-07-03 00:25:41 -07003288 struct task_struct *next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003289{
Ingo Molnardd41f592007-07-09 18:51:59 +02003290 struct mm_struct *mm, *oldmm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003291
Avi Kivitye107be32007-07-26 13:40:43 +02003292 prepare_task_switch(rq, prev, next);
Peter Zijlstrafe4b04f2011-02-02 13:19:09 +01003293
Ingo Molnardd41f592007-07-09 18:51:59 +02003294 mm = next->mm;
3295 oldmm = prev->active_mm;
Zachary Amsden9226d122007-02-13 13:26:21 +01003296 /*
3297 * For paravirt, this is coupled with an exit in switch_to to
3298 * combine the page table reload and the switch backend into
3299 * one hypercall.
3300 */
Jeremy Fitzhardinge224101e2009-02-18 11:18:57 -08003301 arch_start_context_switch(prev);
Zachary Amsden9226d122007-02-13 13:26:21 +01003302
Heiko Carstens31915ab2010-09-16 14:42:25 +02003303 if (!mm) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003304 next->active_mm = oldmm;
3305 atomic_inc(&oldmm->mm_count);
3306 enter_lazy_tlb(oldmm, next);
3307 } else
3308 switch_mm(oldmm, mm, next);
3309
Heiko Carstens31915ab2010-09-16 14:42:25 +02003310 if (!prev->mm) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003311 prev->active_mm = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003312 rq->prev_mm = oldmm;
3313 }
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07003314 /*
3315 * Since the runqueue lock will be released by the next
3316 * task (which is an invalid locking op but in the case
3317 * of the scheduler it's an obvious special-case), so we
3318 * do an early lockdep release here:
3319 */
3320#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07003321 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07003322#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003323
3324 /* Here we just switch the register state and the stack. */
3325 switch_to(prev, next, prev);
3326
Ingo Molnardd41f592007-07-09 18:51:59 +02003327 barrier();
3328 /*
3329 * this_rq must be evaluated again because prev may have moved
3330 * CPUs since it called schedule(), thus the 'rq' on its stack
3331 * frame will be invalid.
3332 */
3333 finish_task_switch(this_rq(), prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003334}
3335
3336/*
3337 * nr_running, nr_uninterruptible and nr_context_switches:
3338 *
3339 * externally visible scheduler statistics: current number of runnable
3340 * threads, current number of uninterruptible-sleeping threads, total
3341 * number of context switches performed since bootup.
3342 */
3343unsigned long nr_running(void)
3344{
3345 unsigned long i, sum = 0;
3346
3347 for_each_online_cpu(i)
3348 sum += cpu_rq(i)->nr_running;
3349
3350 return sum;
3351}
3352
3353unsigned long nr_uninterruptible(void)
3354{
3355 unsigned long i, sum = 0;
3356
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08003357 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003358 sum += cpu_rq(i)->nr_uninterruptible;
3359
3360 /*
3361 * Since we read the counters lockless, it might be slightly
3362 * inaccurate. Do not allow it to go below zero though:
3363 */
3364 if (unlikely((long)sum < 0))
3365 sum = 0;
3366
3367 return sum;
3368}
3369
3370unsigned long long nr_context_switches(void)
3371{
Steven Rostedtcc94abf2006-06-27 02:54:31 -07003372 int i;
3373 unsigned long long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003374
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08003375 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003376 sum += cpu_rq(i)->nr_switches;
3377
3378 return sum;
3379}
3380
3381unsigned long nr_iowait(void)
3382{
3383 unsigned long i, sum = 0;
3384
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08003385 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003386 sum += atomic_read(&cpu_rq(i)->nr_iowait);
3387
3388 return sum;
3389}
3390
Peter Zijlstra8c215bd2010-07-01 09:07:17 +02003391unsigned long nr_iowait_cpu(int cpu)
Arjan van de Ven69d25872009-09-21 17:04:08 -07003392{
Peter Zijlstra8c215bd2010-07-01 09:07:17 +02003393 struct rq *this = cpu_rq(cpu);
Arjan van de Ven69d25872009-09-21 17:04:08 -07003394 return atomic_read(&this->nr_iowait);
3395}
3396
3397unsigned long this_cpu_load(void)
3398{
3399 struct rq *this = this_rq();
3400 return this->cpu_load[0];
3401}
3402
3403
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003404/* Variables and functions for calc_load */
3405static atomic_long_t calc_load_tasks;
3406static unsigned long calc_load_update;
3407unsigned long avenrun[3];
3408EXPORT_SYMBOL(avenrun);
3409
Peter Zijlstra74f51872010-04-22 21:50:19 +02003410static long calc_load_fold_active(struct rq *this_rq)
3411{
3412 long nr_active, delta = 0;
3413
3414 nr_active = this_rq->nr_running;
3415 nr_active += (long) this_rq->nr_uninterruptible;
3416
3417 if (nr_active != this_rq->calc_load_active) {
3418 delta = nr_active - this_rq->calc_load_active;
3419 this_rq->calc_load_active = nr_active;
3420 }
3421
3422 return delta;
3423}
3424
Peter Zijlstra0f004f52010-11-30 19:48:45 +01003425static unsigned long
3426calc_load(unsigned long load, unsigned long exp, unsigned long active)
3427{
3428 load *= exp;
3429 load += active * (FIXED_1 - exp);
3430 load += 1UL << (FSHIFT - 1);
3431 return load >> FSHIFT;
3432}
3433
Peter Zijlstra74f51872010-04-22 21:50:19 +02003434#ifdef CONFIG_NO_HZ
3435/*
3436 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
3437 *
3438 * When making the ILB scale, we should try to pull this in as well.
3439 */
3440static atomic_long_t calc_load_tasks_idle;
3441
3442static void calc_load_account_idle(struct rq *this_rq)
3443{
3444 long delta;
3445
3446 delta = calc_load_fold_active(this_rq);
3447 if (delta)
3448 atomic_long_add(delta, &calc_load_tasks_idle);
3449}
3450
3451static long calc_load_fold_idle(void)
3452{
3453 long delta = 0;
3454
3455 /*
3456 * Its got a race, we don't care...
3457 */
3458 if (atomic_long_read(&calc_load_tasks_idle))
3459 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
3460
3461 return delta;
3462}
Peter Zijlstra0f004f52010-11-30 19:48:45 +01003463
3464/**
3465 * fixed_power_int - compute: x^n, in O(log n) time
3466 *
3467 * @x: base of the power
3468 * @frac_bits: fractional bits of @x
3469 * @n: power to raise @x to.
3470 *
3471 * By exploiting the relation between the definition of the natural power
3472 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3473 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3474 * (where: n_i \elem {0, 1}, the binary vector representing n),
3475 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3476 * of course trivially computable in O(log_2 n), the length of our binary
3477 * vector.
3478 */
3479static unsigned long
3480fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3481{
3482 unsigned long result = 1UL << frac_bits;
3483
3484 if (n) for (;;) {
3485 if (n & 1) {
3486 result *= x;
3487 result += 1UL << (frac_bits - 1);
3488 result >>= frac_bits;
3489 }
3490 n >>= 1;
3491 if (!n)
3492 break;
3493 x *= x;
3494 x += 1UL << (frac_bits - 1);
3495 x >>= frac_bits;
3496 }
3497
3498 return result;
3499}
3500
3501/*
3502 * a1 = a0 * e + a * (1 - e)
3503 *
3504 * a2 = a1 * e + a * (1 - e)
3505 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3506 * = a0 * e^2 + a * (1 - e) * (1 + e)
3507 *
3508 * a3 = a2 * e + a * (1 - e)
3509 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3510 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3511 *
3512 * ...
3513 *
3514 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3515 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3516 * = a0 * e^n + a * (1 - e^n)
3517 *
3518 * [1] application of the geometric series:
3519 *
3520 * n 1 - x^(n+1)
3521 * S_n := \Sum x^i = -------------
3522 * i=0 1 - x
3523 */
3524static unsigned long
3525calc_load_n(unsigned long load, unsigned long exp,
3526 unsigned long active, unsigned int n)
3527{
3528
3529 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3530}
3531
3532/*
3533 * NO_HZ can leave us missing all per-cpu ticks calling
3534 * calc_load_account_active(), but since an idle CPU folds its delta into
3535 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3536 * in the pending idle delta if our idle period crossed a load cycle boundary.
3537 *
3538 * Once we've updated the global active value, we need to apply the exponential
3539 * weights adjusted to the number of cycles missed.
3540 */
3541static void calc_global_nohz(unsigned long ticks)
3542{
3543 long delta, active, n;
3544
3545 if (time_before(jiffies, calc_load_update))
3546 return;
3547
3548 /*
3549 * If we crossed a calc_load_update boundary, make sure to fold
3550 * any pending idle changes, the respective CPUs might have
3551 * missed the tick driven calc_load_account_active() update
3552 * due to NO_HZ.
3553 */
3554 delta = calc_load_fold_idle();
3555 if (delta)
3556 atomic_long_add(delta, &calc_load_tasks);
3557
3558 /*
3559 * If we were idle for multiple load cycles, apply them.
3560 */
3561 if (ticks >= LOAD_FREQ) {
3562 n = ticks / LOAD_FREQ;
3563
3564 active = atomic_long_read(&calc_load_tasks);
3565 active = active > 0 ? active * FIXED_1 : 0;
3566
3567 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3568 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3569 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3570
3571 calc_load_update += n * LOAD_FREQ;
3572 }
3573
3574 /*
3575 * Its possible the remainder of the above division also crosses
3576 * a LOAD_FREQ period, the regular check in calc_global_load()
3577 * which comes after this will take care of that.
3578 *
3579 * Consider us being 11 ticks before a cycle completion, and us
3580 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3581 * age us 4 cycles, and the test in calc_global_load() will
3582 * pick up the final one.
3583 */
3584}
Peter Zijlstra74f51872010-04-22 21:50:19 +02003585#else
3586static void calc_load_account_idle(struct rq *this_rq)
3587{
3588}
3589
3590static inline long calc_load_fold_idle(void)
3591{
3592 return 0;
3593}
Peter Zijlstra0f004f52010-11-30 19:48:45 +01003594
3595static void calc_global_nohz(unsigned long ticks)
3596{
3597}
Peter Zijlstra74f51872010-04-22 21:50:19 +02003598#endif
3599
Thomas Gleixner2d024942009-05-02 20:08:52 +02003600/**
3601 * get_avenrun - get the load average array
3602 * @loads: pointer to dest load array
3603 * @offset: offset to add
3604 * @shift: shift count to shift the result left
3605 *
3606 * These values are estimates at best, so no need for locking.
3607 */
3608void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3609{
3610 loads[0] = (avenrun[0] + offset) << shift;
3611 loads[1] = (avenrun[1] + offset) << shift;
3612 loads[2] = (avenrun[2] + offset) << shift;
3613}
3614
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003615/*
3616 * calc_load - update the avenrun load estimates 10 ticks after the
3617 * CPUs have updated calc_load_tasks.
3618 */
Peter Zijlstra0f004f52010-11-30 19:48:45 +01003619void calc_global_load(unsigned long ticks)
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003620{
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003621 long active;
3622
Peter Zijlstra0f004f52010-11-30 19:48:45 +01003623 calc_global_nohz(ticks);
3624
3625 if (time_before(jiffies, calc_load_update + 10))
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003626 return;
3627
3628 active = atomic_long_read(&calc_load_tasks);
3629 active = active > 0 ? active * FIXED_1 : 0;
3630
3631 avenrun[0] = calc_load(avenrun[0], EXP_1, active);
3632 avenrun[1] = calc_load(avenrun[1], EXP_5, active);
3633 avenrun[2] = calc_load(avenrun[2], EXP_15, active);
3634
3635 calc_load_update += LOAD_FREQ;
3636}
3637
3638/*
Peter Zijlstra74f51872010-04-22 21:50:19 +02003639 * Called from update_cpu_load() to periodically update this CPU's
3640 * active count.
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003641 */
3642static void calc_load_account_active(struct rq *this_rq)
3643{
Peter Zijlstra74f51872010-04-22 21:50:19 +02003644 long delta;
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003645
Peter Zijlstra74f51872010-04-22 21:50:19 +02003646 if (time_before(jiffies, this_rq->calc_load_update))
3647 return;
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003648
Peter Zijlstra74f51872010-04-22 21:50:19 +02003649 delta = calc_load_fold_active(this_rq);
3650 delta += calc_load_fold_idle();
3651 if (delta)
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003652 atomic_long_add(delta, &calc_load_tasks);
Peter Zijlstra74f51872010-04-22 21:50:19 +02003653
3654 this_rq->calc_load_update += LOAD_FREQ;
Jack Steinerdb1b1fe2006-03-31 02:31:21 -08003655}
3656
Linus Torvalds1da177e2005-04-16 15:20:36 -07003657/*
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07003658 * The exact cpuload at various idx values, calculated at every tick would be
3659 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
3660 *
3661 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
3662 * on nth tick when cpu may be busy, then we have:
3663 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3664 * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3665 *
3666 * decay_load_missed() below does efficient calculation of
3667 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3668 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3669 *
3670 * The calculation is approximated on a 128 point scale.
3671 * degrade_zero_ticks is the number of ticks after which load at any
3672 * particular idx is approximated to be zero.
3673 * degrade_factor is a precomputed table, a row for each load idx.
3674 * Each column corresponds to degradation factor for a power of two ticks,
3675 * based on 128 point scale.
3676 * Example:
3677 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3678 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3679 *
3680 * With this power of 2 load factors, we can degrade the load n times
3681 * by looking at 1 bits in n and doing as many mult/shift instead of
3682 * n mult/shifts needed by the exact degradation.
3683 */
3684#define DEGRADE_SHIFT 7
3685static const unsigned char
3686 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3687static const unsigned char
3688 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3689 {0, 0, 0, 0, 0, 0, 0, 0},
3690 {64, 32, 8, 0, 0, 0, 0, 0},
3691 {96, 72, 40, 12, 1, 0, 0},
3692 {112, 98, 75, 43, 15, 1, 0},
3693 {120, 112, 98, 76, 45, 16, 2} };
3694
3695/*
3696 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3697 * would be when CPU is idle and so we just decay the old load without
3698 * adding any new load.
3699 */
3700static unsigned long
3701decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3702{
3703 int j = 0;
3704
3705 if (!missed_updates)
3706 return load;
3707
3708 if (missed_updates >= degrade_zero_ticks[idx])
3709 return 0;
3710
3711 if (idx == 1)
3712 return load >> missed_updates;
3713
3714 while (missed_updates) {
3715 if (missed_updates % 2)
3716 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3717
3718 missed_updates >>= 1;
3719 j++;
3720 }
3721 return load;
3722}
3723
3724/*
Ingo Molnardd41f592007-07-09 18:51:59 +02003725 * Update rq->cpu_load[] statistics. This function is usually called every
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07003726 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3727 * every tick. We fix it up based on jiffies.
Ingo Molnar48f24c42006-07-03 00:25:40 -07003728 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003729static void update_cpu_load(struct rq *this_rq)
Ingo Molnar48f24c42006-07-03 00:25:40 -07003730{
Dmitry Adamushko495eca42007-10-15 17:00:06 +02003731 unsigned long this_load = this_rq->load.weight;
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07003732 unsigned long curr_jiffies = jiffies;
3733 unsigned long pending_updates;
Ingo Molnardd41f592007-07-09 18:51:59 +02003734 int i, scale;
3735
3736 this_rq->nr_load_updates++;
Ingo Molnardd41f592007-07-09 18:51:59 +02003737
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07003738 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3739 if (curr_jiffies == this_rq->last_load_update_tick)
3740 return;
3741
3742 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3743 this_rq->last_load_update_tick = curr_jiffies;
3744
Ingo Molnardd41f592007-07-09 18:51:59 +02003745 /* Update our load: */
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07003746 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3747 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
Ingo Molnardd41f592007-07-09 18:51:59 +02003748 unsigned long old_load, new_load;
3749
3750 /* scale is effectively 1 << i now, and >> i divides by scale */
3751
3752 old_load = this_rq->cpu_load[i];
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07003753 old_load = decay_load_missed(old_load, pending_updates - 1, i);
Ingo Molnardd41f592007-07-09 18:51:59 +02003754 new_load = this_load;
Ingo Molnara25707f2007-10-15 17:00:03 +02003755 /*
3756 * Round up the averaging division if load is increasing. This
3757 * prevents us from getting stuck on 9 if the load is 10, for
3758 * example.
3759 */
3760 if (new_load > old_load)
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07003761 new_load += scale - 1;
3762
3763 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
Ingo Molnardd41f592007-07-09 18:51:59 +02003764 }
Suresh Siddhada2b71e2010-08-23 13:42:51 -07003765
3766 sched_avg_update(this_rq);
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07003767}
3768
3769static void update_cpu_load_active(struct rq *this_rq)
3770{
3771 update_cpu_load(this_rq);
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02003772
Peter Zijlstra74f51872010-04-22 21:50:19 +02003773 calc_load_account_active(this_rq);
Ingo Molnar48f24c42006-07-03 00:25:40 -07003774}
3775
Ingo Molnardd41f592007-07-09 18:51:59 +02003776#ifdef CONFIG_SMP
3777
Ingo Molnar48f24c42006-07-03 00:25:40 -07003778/*
Peter Zijlstra38022902009-12-16 18:04:37 +01003779 * sched_exec - execve() is a valuable balancing opportunity, because at
3780 * this point the task has the smallest effective memory and cache footprint.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003781 */
Peter Zijlstra38022902009-12-16 18:04:37 +01003782void sched_exec(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003783{
Peter Zijlstra38022902009-12-16 18:04:37 +01003784 struct task_struct *p = current;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003785 unsigned long flags;
Peter Zijlstra0017d732010-03-24 18:34:10 +01003786 int dest_cpu;
Peter Zijlstra38022902009-12-16 18:04:37 +01003787
Peter Zijlstra8f42ced2011-04-05 17:23:53 +02003788 raw_spin_lock_irqsave(&p->pi_lock, flags);
Peter Zijlstra7608dec2011-04-05 17:23:46 +02003789 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
Peter Zijlstra0017d732010-03-24 18:34:10 +01003790 if (dest_cpu == smp_processor_id())
3791 goto unlock;
Peter Zijlstra38022902009-12-16 18:04:37 +01003792
Peter Zijlstra8f42ced2011-04-05 17:23:53 +02003793 if (likely(cpu_active(dest_cpu))) {
Tejun Heo969c7922010-05-06 18:49:21 +02003794 struct migration_arg arg = { p, dest_cpu };
Ingo Molnar36c8b582006-07-03 00:25:41 -07003795
Peter Zijlstra8f42ced2011-04-05 17:23:53 +02003796 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3797 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003798 return;
3799 }
Peter Zijlstra0017d732010-03-24 18:34:10 +01003800unlock:
Peter Zijlstra8f42ced2011-04-05 17:23:53 +02003801 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003802}
3803
Linus Torvalds1da177e2005-04-16 15:20:36 -07003804#endif
3805
Linus Torvalds1da177e2005-04-16 15:20:36 -07003806DEFINE_PER_CPU(struct kernel_stat, kstat);
3807
3808EXPORT_PER_CPU_SYMBOL(kstat);
3809
3810/*
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003811 * Return any ns on the sched_clock that have not yet been accounted in
Frank Mayharf06febc2008-09-12 09:54:39 -07003812 * @p in case that task is currently running.
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003813 *
3814 * Called with task_rq_lock() held on @rq.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003815 */
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003816static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3817{
3818 u64 ns = 0;
3819
3820 if (task_current(rq, p)) {
3821 update_rq_clock(rq);
Venkatesh Pallipadi305e6832010-10-04 17:03:21 -07003822 ns = rq->clock_task - p->se.exec_start;
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003823 if ((s64)ns < 0)
3824 ns = 0;
3825 }
3826
3827 return ns;
3828}
3829
Frank Mayharbb34d922008-09-12 09:54:39 -07003830unsigned long long task_delta_exec(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003831{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003832 unsigned long flags;
Ingo Molnar41b86e92007-07-09 18:51:58 +02003833 struct rq *rq;
Frank Mayharbb34d922008-09-12 09:54:39 -07003834 u64 ns = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003835
Ingo Molnar41b86e92007-07-09 18:51:58 +02003836 rq = task_rq_lock(p, &flags);
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003837 ns = do_task_delta_exec(p, rq);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02003838 task_rq_unlock(rq, p, &flags);
Ingo Molnar15084872008-09-30 08:28:17 +02003839
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003840 return ns;
3841}
Frank Mayharf06febc2008-09-12 09:54:39 -07003842
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003843/*
3844 * Return accounted runtime for the task.
3845 * In case the task is currently running, return the runtime plus current's
3846 * pending runtime that have not been accounted yet.
3847 */
3848unsigned long long task_sched_runtime(struct task_struct *p)
3849{
3850 unsigned long flags;
3851 struct rq *rq;
3852 u64 ns = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003853
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003854 rq = task_rq_lock(p, &flags);
3855 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02003856 task_rq_unlock(rq, p, &flags);
Hidetoshi Setoc5f8d992009-03-31 16:56:03 +09003857
3858 return ns;
3859}
3860
3861/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003862 * Account user cpu time to a process.
3863 * @p: the process that the cpu time gets accounted to
Linus Torvalds1da177e2005-04-16 15:20:36 -07003864 * @cputime: the cpu time spent in user space since the last update
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003865 * @cputime_scaled: cputime scaled by cpu frequency
Linus Torvalds1da177e2005-04-16 15:20:36 -07003866 */
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003867void account_user_time(struct task_struct *p, cputime_t cputime,
3868 cputime_t cputime_scaled)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003869{
3870 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3871 cputime64_t tmp;
3872
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003873 /* Add user time to process. */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003874 p->utime = cputime_add(p->utime, cputime);
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003875 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
Frank Mayharf06febc2008-09-12 09:54:39 -07003876 account_group_user_time(p, cputime);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003877
3878 /* Add user time to cpustat. */
3879 tmp = cputime_to_cputime64(cputime);
3880 if (TASK_NICE(p) > 0)
3881 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3882 else
3883 cpustat->user = cputime64_add(cpustat->user, tmp);
Bharata B Raoef12fef2009-03-31 10:02:22 +05303884
3885 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
Jonathan Lim49b5cf32008-07-25 01:48:40 -07003886 /* Account for user time used */
3887 acct_update_integrals(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003888}
3889
3890/*
Laurent Vivier94886b82007-10-15 17:00:19 +02003891 * Account guest cpu time to a process.
3892 * @p: the process that the cpu time gets accounted to
3893 * @cputime: the cpu time spent in virtual machine since the last update
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003894 * @cputime_scaled: cputime scaled by cpu frequency
Laurent Vivier94886b82007-10-15 17:00:19 +02003895 */
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003896static void account_guest_time(struct task_struct *p, cputime_t cputime,
3897 cputime_t cputime_scaled)
Laurent Vivier94886b82007-10-15 17:00:19 +02003898{
3899 cputime64_t tmp;
3900 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3901
3902 tmp = cputime_to_cputime64(cputime);
3903
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003904 /* Add guest time to process. */
Laurent Vivier94886b82007-10-15 17:00:19 +02003905 p->utime = cputime_add(p->utime, cputime);
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003906 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
Frank Mayharf06febc2008-09-12 09:54:39 -07003907 account_group_user_time(p, cputime);
Laurent Vivier94886b82007-10-15 17:00:19 +02003908 p->gtime = cputime_add(p->gtime, cputime);
3909
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003910 /* Add guest time to cpustat. */
Ryota Ozakice0e7b22009-10-24 01:20:10 +09003911 if (TASK_NICE(p) > 0) {
3912 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3913 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
3914 } else {
3915 cpustat->user = cputime64_add(cpustat->user, tmp);
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp);
3917 }
Laurent Vivier94886b82007-10-15 17:00:19 +02003918}
3919
3920/*
Venkatesh Pallipadi70a89a62010-12-21 17:09:02 -08003921 * Account system cpu time to a process and desired cpustat field
3922 * @p: the process that the cpu time gets accounted to
3923 * @cputime: the cpu time spent in kernel space since the last update
3924 * @cputime_scaled: cputime scaled by cpu frequency
3925 * @target_cputime64: pointer to cpustat field that has to be updated
3926 */
3927static inline
3928void __account_system_time(struct task_struct *p, cputime_t cputime,
3929 cputime_t cputime_scaled, cputime64_t *target_cputime64)
3930{
3931 cputime64_t tmp = cputime_to_cputime64(cputime);
3932
3933 /* Add system time to process. */
3934 p->stime = cputime_add(p->stime, cputime);
3935 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
3936 account_group_system_time(p, cputime);
3937
3938 /* Add system time to cpustat. */
3939 *target_cputime64 = cputime64_add(*target_cputime64, tmp);
3940 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3941
3942 /* Account for system time used */
3943 acct_update_integrals(p);
3944}
3945
3946/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003947 * Account system cpu time to a process.
3948 * @p: the process that the cpu time gets accounted to
3949 * @hardirq_offset: the offset to subtract from hardirq_count()
3950 * @cputime: the cpu time spent in kernel space since the last update
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003951 * @cputime_scaled: cputime scaled by cpu frequency
Linus Torvalds1da177e2005-04-16 15:20:36 -07003952 */
3953void account_system_time(struct task_struct *p, int hardirq_offset,
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003954 cputime_t cputime, cputime_t cputime_scaled)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003955{
3956 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Venkatesh Pallipadi70a89a62010-12-21 17:09:02 -08003957 cputime64_t *target_cputime64;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003958
Harvey Harrison983ed7a2008-04-24 18:17:55 -07003959 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
Martin Schwidefsky457533a2008-12-31 15:11:37 +01003960 account_guest_time(p, cputime, cputime_scaled);
Harvey Harrison983ed7a2008-04-24 18:17:55 -07003961 return;
3962 }
Laurent Vivier94886b82007-10-15 17:00:19 +02003963
Linus Torvalds1da177e2005-04-16 15:20:36 -07003964 if (hardirq_count() - hardirq_offset)
Venkatesh Pallipadi70a89a62010-12-21 17:09:02 -08003965 target_cputime64 = &cpustat->irq;
Venkatesh Pallipadi75e10562010-10-04 17:03:16 -07003966 else if (in_serving_softirq())
Venkatesh Pallipadi70a89a62010-12-21 17:09:02 -08003967 target_cputime64 = &cpustat->softirq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003968 else
Venkatesh Pallipadi70a89a62010-12-21 17:09:02 -08003969 target_cputime64 = &cpustat->system;
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01003970
Venkatesh Pallipadi70a89a62010-12-21 17:09:02 -08003971 __account_system_time(p, cputime, cputime_scaled, target_cputime64);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003972}
3973
3974/*
3975 * Account for involuntary wait time.
Venkatesh Pallipadi544b4a12011-02-25 15:13:16 -08003976 * @cputime: the cpu time spent in involuntary wait
Linus Torvalds1da177e2005-04-16 15:20:36 -07003977 */
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01003978void account_steal_time(cputime_t cputime)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003979{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01003981 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3982
3983 cpustat->steal = cputime64_add(cpustat->steal, cputime64);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003984}
3985
Christoph Lameter7835b982006-12-10 02:20:22 -08003986/*
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01003987 * Account for idle time.
3988 * @cputime: the cpu time spent in idle wait
Linus Torvalds1da177e2005-04-16 15:20:36 -07003989 */
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01003990void account_idle_time(cputime_t cputime)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003991{
3992 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01003993 cputime64_t cputime64 = cputime_to_cputime64(cputime);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003994 struct rq *rq = this_rq();
3995
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01003996 if (atomic_read(&rq->nr_iowait) > 0)
3997 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
3998 else
3999 cpustat->idle = cputime64_add(cpustat->idle, cputime64);
Christoph Lameter7835b982006-12-10 02:20:22 -08004000}
4001
Glauber Costae6e66852011-07-11 15:28:17 -04004002static __always_inline bool steal_account_process_tick(void)
4003{
4004#ifdef CONFIG_PARAVIRT
4005 if (static_branch(&paravirt_steal_enabled)) {
4006 u64 steal, st = 0;
4007
4008 steal = paravirt_steal_clock(smp_processor_id());
4009 steal -= this_rq()->prev_steal_time;
4010
4011 st = steal_ticks(steal);
4012 this_rq()->prev_steal_time += st * TICK_NSEC;
4013
4014 account_steal_time(st);
4015 return st;
4016 }
4017#endif
4018 return false;
4019}
4020
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01004021#ifndef CONFIG_VIRT_CPU_ACCOUNTING
4022
Venkatesh Pallipadiabb74ce2010-12-21 17:09:03 -08004023#ifdef CONFIG_IRQ_TIME_ACCOUNTING
4024/*
4025 * Account a tick to a process and cpustat
4026 * @p: the process that the cpu time gets accounted to
4027 * @user_tick: is the tick from userspace
4028 * @rq: the pointer to rq
4029 *
4030 * Tick demultiplexing follows the order
4031 * - pending hardirq update
4032 * - pending softirq update
4033 * - user_time
4034 * - idle_time
4035 * - system time
4036 * - check for guest_time
4037 * - else account as system_time
4038 *
4039 * Check for hardirq is done both for system and user time as there is
4040 * no timer going off while we are on hardirq and hence we may never get an
4041 * opportunity to update it solely in system time.
4042 * p->stime and friends are only updated on system time and not on irq
4043 * softirq as those do not count in task exec_runtime any more.
4044 */
4045static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4046 struct rq *rq)
4047{
4048 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4049 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
4050 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4051
Glauber Costae6e66852011-07-11 15:28:17 -04004052 if (steal_account_process_tick())
4053 return;
4054
Venkatesh Pallipadiabb74ce2010-12-21 17:09:03 -08004055 if (irqtime_account_hi_update()) {
4056 cpustat->irq = cputime64_add(cpustat->irq, tmp);
4057 } else if (irqtime_account_si_update()) {
4058 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
Venkatesh Pallipadi414bee92010-12-21 17:09:04 -08004059 } else if (this_cpu_ksoftirqd() == p) {
4060 /*
4061 * ksoftirqd time do not get accounted in cpu_softirq_time.
4062 * So, we have to handle it separately here.
4063 * Also, p->stime needs to be updated for ksoftirqd.
4064 */
4065 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4066 &cpustat->softirq);
Venkatesh Pallipadiabb74ce2010-12-21 17:09:03 -08004067 } else if (user_tick) {
4068 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4069 } else if (p == rq->idle) {
4070 account_idle_time(cputime_one_jiffy);
4071 } else if (p->flags & PF_VCPU) { /* System time or guest time */
4072 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4073 } else {
4074 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4075 &cpustat->system);
4076 }
4077}
4078
4079static void irqtime_account_idle_ticks(int ticks)
4080{
4081 int i;
4082 struct rq *rq = this_rq();
4083
4084 for (i = 0; i < ticks; i++)
4085 irqtime_account_process_tick(current, 0, rq);
4086}
Venkatesh Pallipadi544b4a12011-02-25 15:13:16 -08004087#else /* CONFIG_IRQ_TIME_ACCOUNTING */
Venkatesh Pallipadiabb74ce2010-12-21 17:09:03 -08004088static void irqtime_account_idle_ticks(int ticks) {}
4089static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4090 struct rq *rq) {}
Venkatesh Pallipadi544b4a12011-02-25 15:13:16 -08004091#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01004092
4093/*
4094 * Account a single tick of cpu time.
4095 * @p: the process that the cpu time gets accounted to
4096 * @user_tick: indicates if the tick is a user or a system tick
4097 */
4098void account_process_tick(struct task_struct *p, int user_tick)
4099{
Stanislaw Gruszkaa42548a2009-07-29 12:15:29 +02004100 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01004101 struct rq *rq = this_rq();
4102
Venkatesh Pallipadiabb74ce2010-12-21 17:09:03 -08004103 if (sched_clock_irqtime) {
4104 irqtime_account_process_tick(p, user_tick, rq);
4105 return;
4106 }
4107
Glauber Costae6e66852011-07-11 15:28:17 -04004108 if (steal_account_process_tick())
4109 return;
4110
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01004111 if (user_tick)
Stanislaw Gruszkaa42548a2009-07-29 12:15:29 +02004112 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
Eric Dumazetf5f293a2009-04-29 14:44:49 +02004113 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
Stanislaw Gruszkaa42548a2009-07-29 12:15:29 +02004114 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01004115 one_jiffy_scaled);
4116 else
Stanislaw Gruszkaa42548a2009-07-29 12:15:29 +02004117 account_idle_time(cputime_one_jiffy);
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01004118}
4119
4120/*
4121 * Account multiple ticks of steal time.
4122 * @p: the process from which the cpu time has been stolen
4123 * @ticks: number of stolen ticks
4124 */
4125void account_steal_ticks(unsigned long ticks)
4126{
4127 account_steal_time(jiffies_to_cputime(ticks));
4128}
4129
4130/*
4131 * Account multiple ticks of idle time.
4132 * @ticks: number of stolen ticks
4133 */
4134void account_idle_ticks(unsigned long ticks)
4135{
Venkatesh Pallipadiabb74ce2010-12-21 17:09:03 -08004136
4137 if (sched_clock_irqtime) {
4138 irqtime_account_idle_ticks(ticks);
4139 return;
4140 }
4141
Martin Schwidefsky79741dd2008-12-31 15:11:38 +01004142 account_idle_time(jiffies_to_cputime(ticks));
4143}
4144
4145#endif
4146
Christoph Lameter7835b982006-12-10 02:20:22 -08004147/*
Balbir Singh49048622008-09-05 18:12:23 +02004148 * Use precise platform statistics if available:
4149 */
4150#ifdef CONFIG_VIRT_CPU_ACCOUNTING
Hidetoshi Setod180c5b2009-11-26 14:48:30 +09004151void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
Balbir Singh49048622008-09-05 18:12:23 +02004152{
Hidetoshi Setod99ca3b2009-12-02 17:26:47 +09004153 *ut = p->utime;
4154 *st = p->stime;
Balbir Singh49048622008-09-05 18:12:23 +02004155}
4156
Hidetoshi Seto0cf55e12009-12-02 17:28:07 +09004157void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
Balbir Singh49048622008-09-05 18:12:23 +02004158{
Hidetoshi Seto0cf55e12009-12-02 17:28:07 +09004159 struct task_cputime cputime;
4160
4161 thread_group_cputime(p, &cputime);
4162
4163 *ut = cputime.utime;
4164 *st = cputime.stime;
Balbir Singh49048622008-09-05 18:12:23 +02004165}
4166#else
Hidetoshi Seto761b1d22009-11-12 13:33:45 +09004167
4168#ifndef nsecs_to_cputime
Hidetoshi Setob7b20df92009-11-26 14:49:27 +09004169# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
Hidetoshi Seto761b1d22009-11-12 13:33:45 +09004170#endif
4171
Hidetoshi Setod180c5b2009-11-26 14:48:30 +09004172void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
Balbir Singh49048622008-09-05 18:12:23 +02004173{
Hidetoshi Setod99ca3b2009-12-02 17:26:47 +09004174 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
Balbir Singh49048622008-09-05 18:12:23 +02004175
4176 /*
4177 * Use CFS's precise accounting:
4178 */
Hidetoshi Setod180c5b2009-11-26 14:48:30 +09004179 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
Balbir Singh49048622008-09-05 18:12:23 +02004180
4181 if (total) {
Stanislaw Gruszkae75e8632010-09-14 16:35:14 +02004182 u64 temp = rtime;
Balbir Singh49048622008-09-05 18:12:23 +02004183
Stanislaw Gruszkae75e8632010-09-14 16:35:14 +02004184 temp *= utime;
Balbir Singh49048622008-09-05 18:12:23 +02004185 do_div(temp, total);
Hidetoshi Setod180c5b2009-11-26 14:48:30 +09004186 utime = (cputime_t)temp;
4187 } else
4188 utime = rtime;
Balbir Singh49048622008-09-05 18:12:23 +02004189
4190 /*
Hidetoshi Setod180c5b2009-11-26 14:48:30 +09004191 * Compare with previous values, to keep monotonicity:
Balbir Singh49048622008-09-05 18:12:23 +02004192 */
Hidetoshi Seto761b1d22009-11-12 13:33:45 +09004193 p->prev_utime = max(p->prev_utime, utime);
Hidetoshi Setod99ca3b2009-12-02 17:26:47 +09004194 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
Balbir Singh49048622008-09-05 18:12:23 +02004195
Hidetoshi Setod99ca3b2009-12-02 17:26:47 +09004196 *ut = p->prev_utime;
4197 *st = p->prev_stime;
Hidetoshi Setod180c5b2009-11-26 14:48:30 +09004198}
Balbir Singh49048622008-09-05 18:12:23 +02004199
Hidetoshi Seto0cf55e12009-12-02 17:28:07 +09004200/*
4201 * Must be called with siglock held.
4202 */
4203void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4204{
4205 struct signal_struct *sig = p->signal;
4206 struct task_cputime cputime;
4207 cputime_t rtime, utime, total;
4208
4209 thread_group_cputime(p, &cputime);
4210
4211 total = cputime_add(cputime.utime, cputime.stime);
4212 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4213
4214 if (total) {
Stanislaw Gruszkae75e8632010-09-14 16:35:14 +02004215 u64 temp = rtime;
Hidetoshi Seto0cf55e12009-12-02 17:28:07 +09004216
Stanislaw Gruszkae75e8632010-09-14 16:35:14 +02004217 temp *= cputime.utime;
Hidetoshi Seto0cf55e12009-12-02 17:28:07 +09004218 do_div(temp, total);
4219 utime = (cputime_t)temp;
4220 } else
4221 utime = rtime;
4222
4223 sig->prev_utime = max(sig->prev_utime, utime);
4224 sig->prev_stime = max(sig->prev_stime,
4225 cputime_sub(rtime, sig->prev_utime));
4226
4227 *ut = sig->prev_utime;
4228 *st = sig->prev_stime;
Balbir Singh49048622008-09-05 18:12:23 +02004229}
4230#endif
4231
Balbir Singh49048622008-09-05 18:12:23 +02004232/*
Christoph Lameter7835b982006-12-10 02:20:22 -08004233 * This function gets called by the timer code, with HZ frequency.
4234 * We call it with interrupts disabled.
Christoph Lameter7835b982006-12-10 02:20:22 -08004235 */
4236void scheduler_tick(void)
4237{
Christoph Lameter7835b982006-12-10 02:20:22 -08004238 int cpu = smp_processor_id();
4239 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004240 struct task_struct *curr = rq->curr;
Peter Zijlstra3e51f332008-05-03 18:29:28 +02004241
4242 sched_clock_tick();
Christoph Lameter7835b982006-12-10 02:20:22 -08004243
Thomas Gleixner05fa7852009-11-17 14:28:38 +01004244 raw_spin_lock(&rq->lock);
Peter Zijlstra3e51f332008-05-03 18:29:28 +02004245 update_rq_clock(rq);
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07004246 update_cpu_load_active(rq);
Peter Zijlstrafa85ae22008-01-25 21:08:29 +01004247 curr->sched_class->task_tick(rq, curr, 0);
Thomas Gleixner05fa7852009-11-17 14:28:38 +01004248 raw_spin_unlock(&rq->lock);
Ingo Molnardd41f592007-07-09 18:51:59 +02004249
Peter Zijlstrae9d2b062010-09-17 11:28:50 +02004250 perf_event_task_tick();
Peter Zijlstrae220d2d2009-05-23 18:28:55 +02004251
Christoph Lametere418e1c2006-12-10 02:20:23 -08004252#ifdef CONFIG_SMP
Suresh Siddha6eb57e02011-10-03 15:09:01 -07004253 rq->idle_balance = idle_cpu(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004254 trigger_load_balance(rq, cpu);
Christoph Lametere418e1c2006-12-10 02:20:23 -08004255#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004256}
4257
Lai Jiangshan132380a2009-04-02 14:18:25 +08004258notrace unsigned long get_parent_ip(unsigned long addr)
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02004259{
4260 if (in_lock_functions(addr)) {
4261 addr = CALLER_ADDR2;
4262 if (in_lock_functions(addr))
4263 addr = CALLER_ADDR3;
4264 }
4265 return addr;
4266}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004267
Steven Rostedt7e49fcc2009-01-22 19:01:40 -05004268#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4269 defined(CONFIG_PREEMPT_TRACER))
4270
Srinivasa Ds43627582008-02-23 15:24:04 -08004271void __kprobes add_preempt_count(int val)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004272{
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02004273#ifdef CONFIG_DEBUG_PREEMPT
Linus Torvalds1da177e2005-04-16 15:20:36 -07004274 /*
4275 * Underflow?
4276 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07004277 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4278 return;
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02004279#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004280 preempt_count() += val;
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02004281#ifdef CONFIG_DEBUG_PREEMPT
Linus Torvalds1da177e2005-04-16 15:20:36 -07004282 /*
4283 * Spinlock count overflowing soon?
4284 */
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08004285 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4286 PREEMPT_MASK - 10);
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02004287#endif
4288 if (preempt_count() == val)
4289 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
Linus Torvalds1da177e2005-04-16 15:20:36 -07004290}
4291EXPORT_SYMBOL(add_preempt_count);
4292
Srinivasa Ds43627582008-02-23 15:24:04 -08004293void __kprobes sub_preempt_count(int val)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004294{
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02004295#ifdef CONFIG_DEBUG_PREEMPT
Linus Torvalds1da177e2005-04-16 15:20:36 -07004296 /*
4297 * Underflow?
4298 */
Ingo Molnar01e3eb82009-01-12 13:00:50 +01004299 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07004300 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004301 /*
4302 * Is the spinlock portion underflowing?
4303 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07004304 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4305 !(preempt_count() & PREEMPT_MASK)))
4306 return;
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02004307#endif
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07004308
Steven Rostedt6cd8a4b2008-05-12 21:20:42 +02004309 if (preempt_count() == val)
4310 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
Linus Torvalds1da177e2005-04-16 15:20:36 -07004311 preempt_count() -= val;
4312}
4313EXPORT_SYMBOL(sub_preempt_count);
4314
4315#endif
4316
4317/*
Ingo Molnardd41f592007-07-09 18:51:59 +02004318 * Print scheduling while atomic bug:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004319 */
Ingo Molnardd41f592007-07-09 18:51:59 +02004320static noinline void __schedule_bug(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004321{
Satyam Sharma838225b2007-10-24 18:23:50 +02004322 struct pt_regs *regs = get_irq_regs();
4323
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01004324 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4325 prev->comm, prev->pid, preempt_count());
Satyam Sharma838225b2007-10-24 18:23:50 +02004326
Ingo Molnardd41f592007-07-09 18:51:59 +02004327 debug_show_held_locks(prev);
Arjan van de Vene21f5b12008-05-23 09:05:58 -07004328 print_modules();
Ingo Molnardd41f592007-07-09 18:51:59 +02004329 if (irqs_disabled())
4330 print_irqtrace_events(prev);
Satyam Sharma838225b2007-10-24 18:23:50 +02004331
4332 if (regs)
4333 show_regs(regs);
4334 else
4335 dump_stack();
Ingo Molnardd41f592007-07-09 18:51:59 +02004336}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004337
Ingo Molnardd41f592007-07-09 18:51:59 +02004338/*
4339 * Various schedule()-time debugging checks and statistics:
4340 */
4341static inline void schedule_debug(struct task_struct *prev)
4342{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004343 /*
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01004344 * Test if we are atomic. Since do_exit() needs to call into
Linus Torvalds1da177e2005-04-16 15:20:36 -07004345 * schedule() atomically, we ignore that path for now.
4346 * Otherwise, whine if we are scheduling when we should not be.
4347 */
Roel Kluin3f33a7c2008-05-13 23:44:11 +02004348 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
Ingo Molnardd41f592007-07-09 18:51:59 +02004349 __schedule_bug(prev);
Paul E. McKenneyb3fbab02011-05-24 08:31:09 -07004350 rcu_sleep_check();
Ingo Molnardd41f592007-07-09 18:51:59 +02004351
Linus Torvalds1da177e2005-04-16 15:20:36 -07004352 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4353
Ingo Molnar2d723762007-10-15 17:00:12 +02004354 schedstat_inc(this_rq(), sched_count);
Ingo Molnardd41f592007-07-09 18:51:59 +02004355}
4356
Peter Zijlstra6cecd082009-11-30 13:00:37 +01004357static void put_prev_task(struct rq *rq, struct task_struct *prev)
Mike Galbraithdf1c99d2009-03-10 19:08:11 +01004358{
Mike Galbraith61eadef2011-04-29 08:36:50 +02004359 if (prev->on_rq || rq->skip_clock_update < 0)
Mike Galbraitha64692a2010-03-11 17:16:20 +01004360 update_rq_clock(rq);
Peter Zijlstra6cecd082009-11-30 13:00:37 +01004361 prev->sched_class->put_prev_task(rq, prev);
Mike Galbraithdf1c99d2009-03-10 19:08:11 +01004362}
4363
Ingo Molnardd41f592007-07-09 18:51:59 +02004364/*
4365 * Pick up the highest-prio task:
4366 */
4367static inline struct task_struct *
Wang Chenb67802e2009-03-02 13:55:26 +08004368pick_next_task(struct rq *rq)
Ingo Molnardd41f592007-07-09 18:51:59 +02004369{
Ingo Molnar5522d5d2007-10-15 17:00:12 +02004370 const struct sched_class *class;
Ingo Molnardd41f592007-07-09 18:51:59 +02004371 struct task_struct *p;
4372
4373 /*
4374 * Optimization: we know that if all tasks are in
4375 * the fair class we can call that function directly:
4376 */
Paul Turner953bfcd2011-07-21 09:43:27 -07004377 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
Ingo Molnarfb8d4722007-08-09 11:16:48 +02004378 p = fair_sched_class.pick_next_task(rq);
Ingo Molnardd41f592007-07-09 18:51:59 +02004379 if (likely(p))
4380 return p;
4381 }
4382
Peter Zijlstra34f971f2010-09-22 13:53:15 +02004383 for_each_class(class) {
Ingo Molnarfb8d4722007-08-09 11:16:48 +02004384 p = class->pick_next_task(rq);
Ingo Molnardd41f592007-07-09 18:51:59 +02004385 if (p)
4386 return p;
Ingo Molnardd41f592007-07-09 18:51:59 +02004387 }
Peter Zijlstra34f971f2010-09-22 13:53:15 +02004388
4389 BUG(); /* the idle class will always have a runnable task */
Ingo Molnardd41f592007-07-09 18:51:59 +02004390}
4391
4392/*
Thomas Gleixnerc259e012011-06-22 19:47:00 +02004393 * __schedule() is the main scheduler function.
Ingo Molnardd41f592007-07-09 18:51:59 +02004394 */
Thomas Gleixnerc259e012011-06-22 19:47:00 +02004395static void __sched __schedule(void)
Ingo Molnardd41f592007-07-09 18:51:59 +02004396{
4397 struct task_struct *prev, *next;
Harvey Harrison67ca7bd2008-02-15 09:56:36 -08004398 unsigned long *switch_count;
Ingo Molnardd41f592007-07-09 18:51:59 +02004399 struct rq *rq;
Peter Zijlstra31656512008-07-18 18:01:23 +02004400 int cpu;
Ingo Molnardd41f592007-07-09 18:51:59 +02004401
Peter Zijlstraff743342009-03-13 12:21:26 +01004402need_resched:
4403 preempt_disable();
Ingo Molnardd41f592007-07-09 18:51:59 +02004404 cpu = smp_processor_id();
4405 rq = cpu_rq(cpu);
Paul E. McKenney25502a62010-04-01 17:37:01 -07004406 rcu_note_context_switch(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004407 prev = rq->curr;
Ingo Molnardd41f592007-07-09 18:51:59 +02004408
Ingo Molnardd41f592007-07-09 18:51:59 +02004409 schedule_debug(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004410
Peter Zijlstra31656512008-07-18 18:01:23 +02004411 if (sched_feat(HRTICK))
Mike Galbraithf333fdc2008-05-12 21:20:55 +02004412 hrtick_clear(rq);
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004413
Thomas Gleixner05fa7852009-11-17 14:28:38 +01004414 raw_spin_lock_irq(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004415
Oleg Nesterov246d86b2010-05-19 14:57:11 +02004416 switch_count = &prev->nivcsw;
Ingo Molnardd41f592007-07-09 18:51:59 +02004417 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
Tejun Heo21aa9af2010-06-08 21:40:37 +02004418 if (unlikely(signal_pending_state(prev->state, prev))) {
Ingo Molnardd41f592007-07-09 18:51:59 +02004419 prev->state = TASK_RUNNING;
Tejun Heo21aa9af2010-06-08 21:40:37 +02004420 } else {
Peter Zijlstra2acca552011-04-05 17:23:50 +02004421 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4422 prev->on_rq = 0;
4423
Tejun Heo21aa9af2010-06-08 21:40:37 +02004424 /*
Peter Zijlstra2acca552011-04-05 17:23:50 +02004425 * If a worker went to sleep, notify and ask workqueue
4426 * whether it wants to wake up a task to maintain
4427 * concurrency.
Tejun Heo21aa9af2010-06-08 21:40:37 +02004428 */
4429 if (prev->flags & PF_WQ_WORKER) {
4430 struct task_struct *to_wakeup;
4431
4432 to_wakeup = wq_worker_sleeping(prev, cpu);
4433 if (to_wakeup)
4434 try_to_wake_up_local(to_wakeup);
4435 }
Tejun Heo21aa9af2010-06-08 21:40:37 +02004436 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004437 switch_count = &prev->nvcsw;
4438 }
4439
Gregory Haskins3f029d32009-07-29 11:08:47 -04004440 pre_schedule(rq, prev);
Steven Rostedtf65eda42008-01-25 21:08:07 +01004441
Ingo Molnardd41f592007-07-09 18:51:59 +02004442 if (unlikely(!rq->nr_running))
4443 idle_balance(cpu, rq);
4444
Mike Galbraithdf1c99d2009-03-10 19:08:11 +01004445 put_prev_task(rq, prev);
Wang Chenb67802e2009-03-02 13:55:26 +08004446 next = pick_next_task(rq);
Mike Galbraithf26f9af2010-12-08 11:05:42 +01004447 clear_tsk_need_resched(prev);
4448 rq->skip_clock_update = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004449
Linus Torvalds1da177e2005-04-16 15:20:36 -07004450 if (likely(prev != next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004451 rq->nr_switches++;
4452 rq->curr = next;
4453 ++*switch_count;
4454
Ingo Molnardd41f592007-07-09 18:51:59 +02004455 context_switch(rq, prev, next); /* unlocks the rq */
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004456 /*
Oleg Nesterov246d86b2010-05-19 14:57:11 +02004457 * The context switch have flipped the stack from under us
4458 * and restored the local variables which were saved when
4459 * this task called schedule() in the past. prev == current
4460 * is still correct, but it can be moved to another cpu/rq.
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01004461 */
4462 cpu = smp_processor_id();
4463 rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004464 } else
Thomas Gleixner05fa7852009-11-17 14:28:38 +01004465 raw_spin_unlock_irq(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004466
Gregory Haskins3f029d32009-07-29 11:08:47 -04004467 post_schedule(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004468
Linus Torvalds1da177e2005-04-16 15:20:36 -07004469 preempt_enable_no_resched();
Peter Zijlstraff743342009-03-13 12:21:26 +01004470 if (need_resched())
Linus Torvalds1da177e2005-04-16 15:20:36 -07004471 goto need_resched;
4472}
Thomas Gleixnerc259e012011-06-22 19:47:00 +02004473
Thomas Gleixner9c40cef2011-06-22 19:47:01 +02004474static inline void sched_submit_work(struct task_struct *tsk)
4475{
4476 if (!tsk->state)
4477 return;
4478 /*
4479 * If we are going to sleep and we have plugged IO queued,
4480 * make sure to submit it to avoid deadlocks.
4481 */
4482 if (blk_needs_flush_plug(tsk))
4483 blk_schedule_flush_plug(tsk);
4484}
4485
Simon Kirby6ebbe7a2011-09-22 17:03:46 -07004486asmlinkage void __sched schedule(void)
Thomas Gleixnerc259e012011-06-22 19:47:00 +02004487{
Thomas Gleixner9c40cef2011-06-22 19:47:01 +02004488 struct task_struct *tsk = current;
4489
4490 sched_submit_work(tsk);
Thomas Gleixnerc259e012011-06-22 19:47:00 +02004491 __schedule();
4492}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004493EXPORT_SYMBOL(schedule);
4494
Frederic Weisbeckerc08f7822009-12-02 20:49:17 +01004495#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
Peter Zijlstrac6eb3dd2011-04-05 17:23:41 +02004496
4497static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4498{
Peter Zijlstrac6eb3dd2011-04-05 17:23:41 +02004499 if (lock->owner != owner)
Thomas Gleixner307bf982011-06-10 15:08:55 +02004500 return false;
Peter Zijlstrac6eb3dd2011-04-05 17:23:41 +02004501
4502 /*
4503 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4504 * lock->owner still matches owner, if that fails, owner might
4505 * point to free()d memory, if it still matches, the rcu_read_lock()
4506 * ensures the memory stays valid.
4507 */
4508 barrier();
4509
Thomas Gleixner307bf982011-06-10 15:08:55 +02004510 return owner->on_cpu;
Peter Zijlstrac6eb3dd2011-04-05 17:23:41 +02004511}
4512
Peter Zijlstra0d66bf62009-01-12 14:01:47 +01004513/*
4514 * Look out! "owner" is an entirely speculative pointer
4515 * access and not reliable.
4516 */
Peter Zijlstrac6eb3dd2011-04-05 17:23:41 +02004517int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
Peter Zijlstra0d66bf62009-01-12 14:01:47 +01004518{
Peter Zijlstra0d66bf62009-01-12 14:01:47 +01004519 if (!sched_feat(OWNER_SPIN))
4520 return 0;
4521
Thomas Gleixner307bf982011-06-10 15:08:55 +02004522 rcu_read_lock();
Peter Zijlstrac6eb3dd2011-04-05 17:23:41 +02004523 while (owner_running(lock, owner)) {
4524 if (need_resched())
Thomas Gleixner307bf982011-06-10 15:08:55 +02004525 break;
Peter Zijlstra0d66bf62009-01-12 14:01:47 +01004526
Gerald Schaefer335d7af2010-11-22 15:47:36 +01004527 arch_mutex_cpu_relax();
Peter Zijlstra0d66bf62009-01-12 14:01:47 +01004528 }
Thomas Gleixner307bf982011-06-10 15:08:55 +02004529 rcu_read_unlock();
Benjamin Herrenschmidt4b402212010-04-16 23:20:00 +02004530
Peter Zijlstrac6eb3dd2011-04-05 17:23:41 +02004531 /*
Thomas Gleixner307bf982011-06-10 15:08:55 +02004532 * We break out the loop above on need_resched() and when the
4533 * owner changed, which is a sign for heavy contention. Return
4534 * success only when lock->owner is NULL.
Peter Zijlstrac6eb3dd2011-04-05 17:23:41 +02004535 */
Thomas Gleixner307bf982011-06-10 15:08:55 +02004536 return lock->owner == NULL;
Peter Zijlstra0d66bf62009-01-12 14:01:47 +01004537}
4538#endif
4539
Linus Torvalds1da177e2005-04-16 15:20:36 -07004540#ifdef CONFIG_PREEMPT
4541/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004542 * this is the entry point to schedule() from in-kernel preemption
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01004543 * off of preempt_enable. Kernel preemptions off return from interrupt
Linus Torvalds1da177e2005-04-16 15:20:36 -07004544 * occur there and call schedule directly.
4545 */
Steven Rostedtd1f74e22010-06-02 21:52:29 -04004546asmlinkage void __sched notrace preempt_schedule(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004547{
4548 struct thread_info *ti = current_thread_info();
Ingo Molnar6478d882008-01-25 21:08:33 +01004549
Linus Torvalds1da177e2005-04-16 15:20:36 -07004550 /*
4551 * If there is a non-zero preempt_count or interrupts are disabled,
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01004552 * we do not want to preempt the current task. Just return..
Linus Torvalds1da177e2005-04-16 15:20:36 -07004553 */
Nick Pigginbeed33a2006-10-11 01:21:52 -07004554 if (likely(ti->preempt_count || irqs_disabled()))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004555 return;
4556
Andi Kleen3a5c3592007-10-15 17:00:14 +02004557 do {
Steven Rostedtd1f74e22010-06-02 21:52:29 -04004558 add_preempt_count_notrace(PREEMPT_ACTIVE);
Thomas Gleixnerc259e012011-06-22 19:47:00 +02004559 __schedule();
Steven Rostedtd1f74e22010-06-02 21:52:29 -04004560 sub_preempt_count_notrace(PREEMPT_ACTIVE);
Andi Kleen3a5c3592007-10-15 17:00:14 +02004561
4562 /*
4563 * Check again in case we missed a preemption opportunity
4564 * between schedule and now.
4565 */
4566 barrier();
Lai Jiangshan5ed0cec2009-03-06 19:40:20 +08004567 } while (need_resched());
Linus Torvalds1da177e2005-04-16 15:20:36 -07004568}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004569EXPORT_SYMBOL(preempt_schedule);
4570
4571/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004572 * this is the entry point to schedule() from kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07004573 * off of irq context.
4574 * Note, that this is called and return with irqs disabled. This will
4575 * protect us against recursive calling from irq.
4576 */
4577asmlinkage void __sched preempt_schedule_irq(void)
4578{
4579 struct thread_info *ti = current_thread_info();
Ingo Molnar6478d882008-01-25 21:08:33 +01004580
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004581 /* Catch callers which need to be fixed */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004582 BUG_ON(ti->preempt_count || !irqs_disabled());
4583
Andi Kleen3a5c3592007-10-15 17:00:14 +02004584 do {
4585 add_preempt_count(PREEMPT_ACTIVE);
Andi Kleen3a5c3592007-10-15 17:00:14 +02004586 local_irq_enable();
Thomas Gleixnerc259e012011-06-22 19:47:00 +02004587 __schedule();
Andi Kleen3a5c3592007-10-15 17:00:14 +02004588 local_irq_disable();
Andi Kleen3a5c3592007-10-15 17:00:14 +02004589 sub_preempt_count(PREEMPT_ACTIVE);
4590
4591 /*
4592 * Check again in case we missed a preemption opportunity
4593 * between schedule and now.
4594 */
4595 barrier();
Lai Jiangshan5ed0cec2009-03-06 19:40:20 +08004596 } while (need_resched());
Linus Torvalds1da177e2005-04-16 15:20:36 -07004597}
4598
4599#endif /* CONFIG_PREEMPT */
4600
Peter Zijlstra63859d42009-09-15 19:14:42 +02004601int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004602 void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004603{
Peter Zijlstra63859d42009-09-15 19:14:42 +02004604 return try_to_wake_up(curr->private, mode, wake_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004605}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004606EXPORT_SYMBOL(default_wake_function);
4607
4608/*
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01004609 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
4610 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
Linus Torvalds1da177e2005-04-16 15:20:36 -07004611 * number) then we wake all the non-exclusive tasks and one exclusive task.
4612 *
4613 * There are circumstances in which we can try to wake a task which has already
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01004614 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
Linus Torvalds1da177e2005-04-16 15:20:36 -07004615 * zero in this (rare) case, and we handle it by continuing to scan the queue.
4616 */
Johannes Weiner78ddb082009-04-14 16:53:05 +02004617static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
Peter Zijlstra63859d42009-09-15 19:14:42 +02004618 int nr_exclusive, int wake_flags, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004619{
Matthias Kaehlcke2e458742007-10-15 17:00:02 +02004620 wait_queue_t *curr, *next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004621
Matthias Kaehlcke2e458742007-10-15 17:00:02 +02004622 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07004623 unsigned flags = curr->flags;
4624
Peter Zijlstra63859d42009-09-15 19:14:42 +02004625 if (curr->func(curr, mode, wake_flags, key) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07004626 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004627 break;
4628 }
4629}
4630
4631/**
4632 * __wake_up - wake up threads blocked on a waitqueue.
4633 * @q: the waitqueue
4634 * @mode: which threads
4635 * @nr_exclusive: how many wake-one or wake-many threads to wake up
Martin Waitz67be2dd2005-05-01 08:59:26 -07004636 * @key: is directly passed to the wakeup function
David Howells50fa6102009-04-28 15:01:38 +01004637 *
4638 * It may be assumed that this function implies a write memory barrier before
4639 * changing the task state if and only if any tasks are woken up.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004640 */
Harvey Harrison7ad5b3a2008-02-08 04:19:53 -08004641void __wake_up(wait_queue_head_t *q, unsigned int mode,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004642 int nr_exclusive, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004643{
4644 unsigned long flags;
4645
4646 spin_lock_irqsave(&q->lock, flags);
4647 __wake_up_common(q, mode, nr_exclusive, 0, key);
4648 spin_unlock_irqrestore(&q->lock, flags);
4649}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004650EXPORT_SYMBOL(__wake_up);
4651
4652/*
4653 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4654 */
Harvey Harrison7ad5b3a2008-02-08 04:19:53 -08004655void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004656{
4657 __wake_up_common(q, mode, 1, 0, NULL);
4658}
Michal Nazarewicz22c43c82010-05-05 12:53:11 +02004659EXPORT_SYMBOL_GPL(__wake_up_locked);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004660
Davide Libenzi4ede8162009-03-31 15:24:20 -07004661void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4662{
4663 __wake_up_common(q, mode, 1, 0, key);
4664}
Trond Myklebustbf294b42011-02-21 11:05:41 -08004665EXPORT_SYMBOL_GPL(__wake_up_locked_key);
Davide Libenzi4ede8162009-03-31 15:24:20 -07004666
Linus Torvalds1da177e2005-04-16 15:20:36 -07004667/**
Davide Libenzi4ede8162009-03-31 15:24:20 -07004668 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004669 * @q: the waitqueue
4670 * @mode: which threads
4671 * @nr_exclusive: how many wake-one or wake-many threads to wake up
Davide Libenzi4ede8162009-03-31 15:24:20 -07004672 * @key: opaque value to be passed to wakeup targets
Linus Torvalds1da177e2005-04-16 15:20:36 -07004673 *
4674 * The sync wakeup differs that the waker knows that it will schedule
4675 * away soon, so while the target thread will be woken up, it will not
4676 * be migrated to another CPU - ie. the two threads are 'synchronized'
4677 * with each other. This can prevent needless bouncing between CPUs.
4678 *
4679 * On UP it can prevent extra preemption.
David Howells50fa6102009-04-28 15:01:38 +01004680 *
4681 * It may be assumed that this function implies a write memory barrier before
4682 * changing the task state if and only if any tasks are woken up.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004683 */
Davide Libenzi4ede8162009-03-31 15:24:20 -07004684void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4685 int nr_exclusive, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004686{
4687 unsigned long flags;
Peter Zijlstra7d478722009-09-14 19:55:44 +02004688 int wake_flags = WF_SYNC;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004689
4690 if (unlikely(!q))
4691 return;
4692
4693 if (unlikely(!nr_exclusive))
Peter Zijlstra7d478722009-09-14 19:55:44 +02004694 wake_flags = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004695
4696 spin_lock_irqsave(&q->lock, flags);
Peter Zijlstra7d478722009-09-14 19:55:44 +02004697 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004698 spin_unlock_irqrestore(&q->lock, flags);
4699}
Davide Libenzi4ede8162009-03-31 15:24:20 -07004700EXPORT_SYMBOL_GPL(__wake_up_sync_key);
4701
4702/*
4703 * __wake_up_sync - see __wake_up_sync_key()
4704 */
4705void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4706{
4707 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
4708}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004709EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4710
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004711/**
4712 * complete: - signals a single thread waiting on this completion
4713 * @x: holds the state of this particular completion
4714 *
4715 * This will wake up a single thread waiting on this completion. Threads will be
4716 * awakened in the same order in which they were queued.
4717 *
4718 * See also complete_all(), wait_for_completion() and related routines.
David Howells50fa6102009-04-28 15:01:38 +01004719 *
4720 * It may be assumed that this function implies a write memory barrier before
4721 * changing the task state if and only if any tasks are woken up.
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004722 */
Ingo Molnarb15136e2007-10-24 18:23:48 +02004723void complete(struct completion *x)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004724{
4725 unsigned long flags;
4726
4727 spin_lock_irqsave(&x->wait.lock, flags);
4728 x->done++;
Matthew Wilcoxd9514f62007-12-06 11:07:07 -05004729 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004730 spin_unlock_irqrestore(&x->wait.lock, flags);
4731}
4732EXPORT_SYMBOL(complete);
4733
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004734/**
4735 * complete_all: - signals all threads waiting on this completion
4736 * @x: holds the state of this particular completion
4737 *
4738 * This will wake up all threads waiting on this particular completion event.
David Howells50fa6102009-04-28 15:01:38 +01004739 *
4740 * It may be assumed that this function implies a write memory barrier before
4741 * changing the task state if and only if any tasks are woken up.
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004742 */
Ingo Molnarb15136e2007-10-24 18:23:48 +02004743void complete_all(struct completion *x)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004744{
4745 unsigned long flags;
4746
4747 spin_lock_irqsave(&x->wait.lock, flags);
4748 x->done += UINT_MAX/2;
Matthew Wilcoxd9514f62007-12-06 11:07:07 -05004749 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004750 spin_unlock_irqrestore(&x->wait.lock, flags);
4751}
4752EXPORT_SYMBOL(complete_all);
4753
Andi Kleen8cbbe862007-10-15 17:00:14 +02004754static inline long __sched
4755do_wait_for_common(struct completion *x, long timeout, int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004756{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004757 if (!x->done) {
4758 DECLARE_WAITQUEUE(wait, current);
4759
Changli Gaoa93d2f12010-05-07 14:33:26 +08004760 __add_wait_queue_tail_exclusive(&x->wait, &wait);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004761 do {
Oleg Nesterov94d3d822008-08-20 16:54:41 -07004762 if (signal_pending_state(state, current)) {
Oleg Nesterovea71a542008-06-20 18:32:20 +04004763 timeout = -ERESTARTSYS;
4764 break;
Andi Kleen8cbbe862007-10-15 17:00:14 +02004765 }
4766 __set_current_state(state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004767 spin_unlock_irq(&x->wait.lock);
Andi Kleen8cbbe862007-10-15 17:00:14 +02004768 timeout = schedule_timeout(timeout);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004769 spin_lock_irq(&x->wait.lock);
Oleg Nesterovea71a542008-06-20 18:32:20 +04004770 } while (!x->done && timeout);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004771 __remove_wait_queue(&x->wait, &wait);
Oleg Nesterovea71a542008-06-20 18:32:20 +04004772 if (!x->done)
4773 return timeout;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004774 }
4775 x->done--;
Oleg Nesterovea71a542008-06-20 18:32:20 +04004776 return timeout ?: 1;
Andi Kleen8cbbe862007-10-15 17:00:14 +02004777}
4778
4779static long __sched
4780wait_for_common(struct completion *x, long timeout, int state)
4781{
4782 might_sleep();
4783
4784 spin_lock_irq(&x->wait.lock);
4785 timeout = do_wait_for_common(x, timeout, state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004786 spin_unlock_irq(&x->wait.lock);
Andi Kleen8cbbe862007-10-15 17:00:14 +02004787 return timeout;
4788}
4789
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004790/**
4791 * wait_for_completion: - waits for completion of a task
4792 * @x: holds the state of this particular completion
4793 *
4794 * This waits to be signaled for completion of a specific task. It is NOT
4795 * interruptible and there is no timeout.
4796 *
4797 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4798 * and interrupt capability. Also see complete().
4799 */
Ingo Molnarb15136e2007-10-24 18:23:48 +02004800void __sched wait_for_completion(struct completion *x)
Andi Kleen8cbbe862007-10-15 17:00:14 +02004801{
4802 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004803}
4804EXPORT_SYMBOL(wait_for_completion);
4805
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004806/**
4807 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4808 * @x: holds the state of this particular completion
4809 * @timeout: timeout value in jiffies
4810 *
4811 * This waits for either a completion of a specific task to be signaled or for a
4812 * specified timeout to expire. The timeout is in jiffies. It is not
4813 * interruptible.
J. Bruce Fieldsc6dc7f02011-10-06 15:22:46 -04004814 *
4815 * The return value is 0 if timed out, and positive (at least 1, or number of
4816 * jiffies left till timeout) if completed.
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004817 */
Ingo Molnarb15136e2007-10-24 18:23:48 +02004818unsigned long __sched
Linus Torvalds1da177e2005-04-16 15:20:36 -07004819wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4820{
Andi Kleen8cbbe862007-10-15 17:00:14 +02004821 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004822}
4823EXPORT_SYMBOL(wait_for_completion_timeout);
4824
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004825/**
4826 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4827 * @x: holds the state of this particular completion
4828 *
4829 * This waits for completion of a specific task to be signaled. It is
4830 * interruptible.
J. Bruce Fieldsc6dc7f02011-10-06 15:22:46 -04004831 *
4832 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004833 */
Andi Kleen8cbbe862007-10-15 17:00:14 +02004834int __sched wait_for_completion_interruptible(struct completion *x)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004835{
Andi Kleen51e97992007-10-18 21:32:55 +02004836 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
4837 if (t == -ERESTARTSYS)
4838 return t;
4839 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004840}
4841EXPORT_SYMBOL(wait_for_completion_interruptible);
4842
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004843/**
4844 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4845 * @x: holds the state of this particular completion
4846 * @timeout: timeout value in jiffies
4847 *
4848 * This waits for either a completion of a specific task to be signaled or for a
4849 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
J. Bruce Fieldsc6dc7f02011-10-06 15:22:46 -04004850 *
4851 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
4852 * positive (at least 1, or number of jiffies left till timeout) if completed.
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004853 */
NeilBrown6bf41232011-01-05 12:50:16 +11004854long __sched
Linus Torvalds1da177e2005-04-16 15:20:36 -07004855wait_for_completion_interruptible_timeout(struct completion *x,
4856 unsigned long timeout)
4857{
Andi Kleen8cbbe862007-10-15 17:00:14 +02004858 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004859}
4860EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4861
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004862/**
4863 * wait_for_completion_killable: - waits for completion of a task (killable)
4864 * @x: holds the state of this particular completion
4865 *
4866 * This waits to be signaled for completion of a specific task. It can be
4867 * interrupted by a kill signal.
J. Bruce Fieldsc6dc7f02011-10-06 15:22:46 -04004868 *
4869 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
Kevin Diggs65eb3dc2008-08-26 10:26:54 +02004870 */
Matthew Wilcox009e5772007-12-06 12:29:54 -05004871int __sched wait_for_completion_killable(struct completion *x)
4872{
4873 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4874 if (t == -ERESTARTSYS)
4875 return t;
4876 return 0;
4877}
4878EXPORT_SYMBOL(wait_for_completion_killable);
4879
Dave Chinnerbe4de352008-08-15 00:40:44 -07004880/**
Sage Weil0aa12fb2010-05-29 09:12:30 -07004881 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
4882 * @x: holds the state of this particular completion
4883 * @timeout: timeout value in jiffies
4884 *
4885 * This waits for either a completion of a specific task to be
4886 * signaled or for a specified timeout to expire. It can be
4887 * interrupted by a kill signal. The timeout is in jiffies.
J. Bruce Fieldsc6dc7f02011-10-06 15:22:46 -04004888 *
4889 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
4890 * positive (at least 1, or number of jiffies left till timeout) if completed.
Sage Weil0aa12fb2010-05-29 09:12:30 -07004891 */
NeilBrown6bf41232011-01-05 12:50:16 +11004892long __sched
Sage Weil0aa12fb2010-05-29 09:12:30 -07004893wait_for_completion_killable_timeout(struct completion *x,
4894 unsigned long timeout)
4895{
4896 return wait_for_common(x, timeout, TASK_KILLABLE);
4897}
4898EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4899
4900/**
Dave Chinnerbe4de352008-08-15 00:40:44 -07004901 * try_wait_for_completion - try to decrement a completion without blocking
4902 * @x: completion structure
4903 *
4904 * Returns: 0 if a decrement cannot be done without blocking
4905 * 1 if a decrement succeeded.
4906 *
4907 * If a completion is being used as a counting completion,
4908 * attempt to decrement the counter without blocking. This
4909 * enables us to avoid waiting if the resource the completion
4910 * is protecting is not available.
4911 */
4912bool try_wait_for_completion(struct completion *x)
4913{
Rafael J. Wysocki7539a3b2009-12-13 00:07:30 +01004914 unsigned long flags;
Dave Chinnerbe4de352008-08-15 00:40:44 -07004915 int ret = 1;
4916
Rafael J. Wysocki7539a3b2009-12-13 00:07:30 +01004917 spin_lock_irqsave(&x->wait.lock, flags);
Dave Chinnerbe4de352008-08-15 00:40:44 -07004918 if (!x->done)
4919 ret = 0;
4920 else
4921 x->done--;
Rafael J. Wysocki7539a3b2009-12-13 00:07:30 +01004922 spin_unlock_irqrestore(&x->wait.lock, flags);
Dave Chinnerbe4de352008-08-15 00:40:44 -07004923 return ret;
4924}
4925EXPORT_SYMBOL(try_wait_for_completion);
4926
4927/**
4928 * completion_done - Test to see if a completion has any waiters
4929 * @x: completion structure
4930 *
4931 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4932 * 1 if there are no waiters.
4933 *
4934 */
4935bool completion_done(struct completion *x)
4936{
Rafael J. Wysocki7539a3b2009-12-13 00:07:30 +01004937 unsigned long flags;
Dave Chinnerbe4de352008-08-15 00:40:44 -07004938 int ret = 1;
4939
Rafael J. Wysocki7539a3b2009-12-13 00:07:30 +01004940 spin_lock_irqsave(&x->wait.lock, flags);
Dave Chinnerbe4de352008-08-15 00:40:44 -07004941 if (!x->done)
4942 ret = 0;
Rafael J. Wysocki7539a3b2009-12-13 00:07:30 +01004943 spin_unlock_irqrestore(&x->wait.lock, flags);
Dave Chinnerbe4de352008-08-15 00:40:44 -07004944 return ret;
4945}
4946EXPORT_SYMBOL(completion_done);
4947
Andi Kleen8cbbe862007-10-15 17:00:14 +02004948static long __sched
4949sleep_on_common(wait_queue_head_t *q, int state, long timeout)
Ingo Molnar0fec1712007-07-09 18:52:01 +02004950{
4951 unsigned long flags;
4952 wait_queue_t wait;
4953
4954 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004955
Andi Kleen8cbbe862007-10-15 17:00:14 +02004956 __set_current_state(state);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004957
Andi Kleen8cbbe862007-10-15 17:00:14 +02004958 spin_lock_irqsave(&q->lock, flags);
4959 __add_wait_queue(q, &wait);
4960 spin_unlock(&q->lock);
4961 timeout = schedule_timeout(timeout);
4962 spin_lock_irq(&q->lock);
4963 __remove_wait_queue(q, &wait);
4964 spin_unlock_irqrestore(&q->lock, flags);
4965
4966 return timeout;
4967}
4968
4969void __sched interruptible_sleep_on(wait_queue_head_t *q)
4970{
4971 sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004972}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004973EXPORT_SYMBOL(interruptible_sleep_on);
4974
Ingo Molnar0fec1712007-07-09 18:52:01 +02004975long __sched
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004976interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004977{
Andi Kleen8cbbe862007-10-15 17:00:14 +02004978 return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004979}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004980EXPORT_SYMBOL(interruptible_sleep_on_timeout);
4981
Ingo Molnar0fec1712007-07-09 18:52:01 +02004982void __sched sleep_on(wait_queue_head_t *q)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004983{
Andi Kleen8cbbe862007-10-15 17:00:14 +02004984 sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004985}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004986EXPORT_SYMBOL(sleep_on);
4987
Ingo Molnar0fec1712007-07-09 18:52:01 +02004988long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004989{
Andi Kleen8cbbe862007-10-15 17:00:14 +02004990 return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004991}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004992EXPORT_SYMBOL(sleep_on_timeout);
4993
Ingo Molnarb29739f2006-06-27 02:54:51 -07004994#ifdef CONFIG_RT_MUTEXES
4995
4996/*
4997 * rt_mutex_setprio - set the current priority of a task
4998 * @p: task
4999 * @prio: prio value (kernel-internal form)
5000 *
5001 * This function changes the 'effective' priority of a task. It does
5002 * not touch ->normal_prio like __setscheduler().
5003 *
5004 * Used by the rt_mutex code to implement priority inheritance logic.
5005 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005006void rt_mutex_setprio(struct task_struct *p, int prio)
Ingo Molnarb29739f2006-06-27 02:54:51 -07005007{
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02005008 int oldprio, on_rq, running;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005009 struct rq *rq;
Thomas Gleixner83ab0aa2010-02-17 09:05:48 +01005010 const struct sched_class *prev_class;
Ingo Molnarb29739f2006-06-27 02:54:51 -07005011
5012 BUG_ON(prio < 0 || prio > MAX_PRIO);
5013
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005014 rq = __task_rq_lock(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07005015
Steven Rostedta8027072010-09-20 15:13:34 -04005016 trace_sched_pi_setprio(p, prio);
Andrew Mortond5f9f942007-05-08 20:27:06 -07005017 oldprio = p->prio;
Thomas Gleixner83ab0aa2010-02-17 09:05:48 +01005018 prev_class = p->sched_class;
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02005019 on_rq = p->on_rq;
Dmitry Adamushko051a1d12007-12-18 15:21:13 +01005020 running = task_current(rq, p);
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07005021 if (on_rq)
Ingo Molnar69be72c2007-08-09 11:16:49 +02005022 dequeue_task(rq, p, 0);
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07005023 if (running)
5024 p->sched_class->put_prev_task(rq, p);
Ingo Molnardd41f592007-07-09 18:51:59 +02005025
5026 if (rt_prio(prio))
5027 p->sched_class = &rt_sched_class;
5028 else
5029 p->sched_class = &fair_sched_class;
5030
Ingo Molnarb29739f2006-06-27 02:54:51 -07005031 p->prio = prio;
5032
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07005033 if (running)
5034 p->sched_class->set_curr_task(rq);
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005035 if (on_rq)
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005036 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
Steven Rostedtcb469842008-01-25 21:08:22 +01005037
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005038 check_class_changed(rq, p, prev_class, oldprio);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005039 __task_rq_unlock(rq);
Ingo Molnarb29739f2006-06-27 02:54:51 -07005040}
5041
5042#endif
5043
Ingo Molnar36c8b582006-07-03 00:25:41 -07005044void set_user_nice(struct task_struct *p, long nice)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005045{
Ingo Molnardd41f592007-07-09 18:51:59 +02005046 int old_prio, delta, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005047 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005048 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005049
5050 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
5051 return;
5052 /*
5053 * We have to be careful, if called from sys_setpriority(),
5054 * the task might be in the middle of scheduling on another CPU.
5055 */
5056 rq = task_rq_lock(p, &flags);
5057 /*
5058 * The RT priorities are set via sched_setscheduler(), but we still
5059 * allow the 'normal' nice value to be set - but as expected
5060 * it wont have any effect on scheduling until the task is
Ingo Molnardd41f592007-07-09 18:51:59 +02005061 * SCHED_FIFO/SCHED_RR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005062 */
Ingo Molnare05606d2007-07-09 18:51:59 +02005063 if (task_has_rt_policy(p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005064 p->static_prio = NICE_TO_PRIO(nice);
5065 goto out_unlock;
5066 }
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02005067 on_rq = p->on_rq;
Peter Zijlstrac09595f2008-06-27 13:41:14 +02005068 if (on_rq)
Ingo Molnar69be72c2007-08-09 11:16:49 +02005069 dequeue_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005070
Linus Torvalds1da177e2005-04-16 15:20:36 -07005071 p->static_prio = NICE_TO_PRIO(nice);
Peter Williams2dd73a42006-06-27 02:54:34 -07005072 set_load_weight(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07005073 old_prio = p->prio;
5074 p->prio = effective_prio(p);
5075 delta = p->prio - old_prio;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005076
Ingo Molnardd41f592007-07-09 18:51:59 +02005077 if (on_rq) {
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01005078 enqueue_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005079 /*
Andrew Mortond5f9f942007-05-08 20:27:06 -07005080 * If the task increased its priority or is running and
5081 * lowered its priority, then reschedule its CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005082 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07005083 if (delta < 0 || (delta > 0 && task_running(rq, p)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005084 resched_task(rq->curr);
5085 }
5086out_unlock:
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005087 task_rq_unlock(rq, p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005088}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005089EXPORT_SYMBOL(set_user_nice);
5090
Matt Mackalle43379f2005-05-01 08:59:00 -07005091/*
5092 * can_nice - check if a task can reduce its nice value
5093 * @p: task
5094 * @nice: nice value
5095 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005096int can_nice(const struct task_struct *p, const int nice)
Matt Mackalle43379f2005-05-01 08:59:00 -07005097{
Matt Mackall024f4742005-08-18 11:24:19 -07005098 /* convert nice value [19,-20] to rlimit style value [1,40] */
5099 int nice_rlim = 20 - nice;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005100
Jiri Slaby78d7d402010-03-05 13:42:54 -08005101 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
Matt Mackalle43379f2005-05-01 08:59:00 -07005102 capable(CAP_SYS_NICE));
5103}
5104
Linus Torvalds1da177e2005-04-16 15:20:36 -07005105#ifdef __ARCH_WANT_SYS_NICE
5106
5107/*
5108 * sys_nice - change the priority of the current process.
5109 * @increment: priority increment
5110 *
5111 * sys_setpriority is a more generic, but much slower function that
5112 * does similar things.
5113 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005114SYSCALL_DEFINE1(nice, int, increment)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005115{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005116 long nice, retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005117
5118 /*
5119 * Setpriority might change our priority at the same moment.
5120 * We don't have to worry. Conceptually one call occurs first
5121 * and we have a single winner.
5122 */
Matt Mackalle43379f2005-05-01 08:59:00 -07005123 if (increment < -40)
5124 increment = -40;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005125 if (increment > 40)
5126 increment = 40;
5127
Américo Wang2b8f8362009-02-16 18:54:21 +08005128 nice = TASK_NICE(current) + increment;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005129 if (nice < -20)
5130 nice = -20;
5131 if (nice > 19)
5132 nice = 19;
5133
Matt Mackalle43379f2005-05-01 08:59:00 -07005134 if (increment < 0 && !can_nice(current, nice))
5135 return -EPERM;
5136
Linus Torvalds1da177e2005-04-16 15:20:36 -07005137 retval = security_task_setnice(current, nice);
5138 if (retval)
5139 return retval;
5140
5141 set_user_nice(current, nice);
5142 return 0;
5143}
5144
5145#endif
5146
5147/**
5148 * task_prio - return the priority value of a given task.
5149 * @p: the task in question.
5150 *
5151 * This is the priority value as seen by users in /proc.
5152 * RT tasks are offset by -200. Normal tasks are centered
5153 * around 0, value goes from -16 to +15.
5154 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005155int task_prio(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005156{
5157 return p->prio - MAX_RT_PRIO;
5158}
5159
5160/**
5161 * task_nice - return the nice value of a given task.
5162 * @p: the task in question.
5163 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005164int task_nice(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005165{
5166 return TASK_NICE(p);
5167}
Pavel Roskin150d8be2008-03-05 16:56:37 -05005168EXPORT_SYMBOL(task_nice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005169
5170/**
5171 * idle_cpu - is a given cpu idle currently?
5172 * @cpu: the processor in question.
5173 */
5174int idle_cpu(int cpu)
5175{
Thomas Gleixner908a3282011-09-15 15:32:06 +02005176 struct rq *rq = cpu_rq(cpu);
5177
5178 if (rq->curr != rq->idle)
5179 return 0;
5180
5181 if (rq->nr_running)
5182 return 0;
5183
5184#ifdef CONFIG_SMP
5185 if (!llist_empty(&rq->wake_list))
5186 return 0;
5187#endif
5188
5189 return 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005190}
5191
Linus Torvalds1da177e2005-04-16 15:20:36 -07005192/**
5193 * idle_task - return the idle task for a given cpu.
5194 * @cpu: the processor in question.
5195 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005196struct task_struct *idle_task(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005197{
5198 return cpu_rq(cpu)->idle;
5199}
5200
5201/**
5202 * find_process_by_pid - find a process with a matching PID value.
5203 * @pid: the pid in question.
5204 */
Alexey Dobriyana9957442007-10-15 17:00:13 +02005205static struct task_struct *find_process_by_pid(pid_t pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005206{
Pavel Emelyanov228ebcb2007-10-18 23:40:16 -07005207 return pid ? find_task_by_vpid(pid) : current;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005208}
5209
5210/* Actually do priority change: must hold rq lock. */
Ingo Molnardd41f592007-07-09 18:51:59 +02005211static void
5212__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005213{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005214 p->policy = policy;
5215 p->rt_priority = prio;
Ingo Molnarb29739f2006-06-27 02:54:51 -07005216 p->normal_prio = normal_prio(p);
5217 /* we are holding p->pi_lock already */
5218 p->prio = rt_mutex_getprio(p);
Peter Zijlstraffd44db2009-11-10 20:12:01 +01005219 if (rt_prio(p->prio))
5220 p->sched_class = &rt_sched_class;
5221 else
5222 p->sched_class = &fair_sched_class;
Peter Williams2dd73a42006-06-27 02:54:34 -07005223 set_load_weight(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005224}
5225
David Howellsc69e8d92008-11-14 10:39:19 +11005226/*
5227 * check the target process has a UID that matches the current process's
5228 */
5229static bool check_same_owner(struct task_struct *p)
5230{
5231 const struct cred *cred = current_cred(), *pcred;
5232 bool match;
5233
5234 rcu_read_lock();
5235 pcred = __task_cred(p);
Serge E. Hallynb0e77592011-03-23 16:43:24 -07005236 if (cred->user->user_ns == pcred->user->user_ns)
5237 match = (cred->euid == pcred->euid ||
5238 cred->euid == pcred->uid);
5239 else
5240 match = false;
David Howellsc69e8d92008-11-14 10:39:19 +11005241 rcu_read_unlock();
5242 return match;
5243}
5244
Rusty Russell961ccdd2008-06-23 13:55:38 +10005245static int __sched_setscheduler(struct task_struct *p, int policy,
KOSAKI Motohirofe7de492010-10-20 16:01:12 -07005246 const struct sched_param *param, bool user)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005247{
Srivatsa Vaddagiri83b699e2007-10-15 17:00:08 +02005248 int retval, oldprio, oldpolicy = -1, on_rq, running;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005249 unsigned long flags;
Thomas Gleixner83ab0aa2010-02-17 09:05:48 +01005250 const struct sched_class *prev_class;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005251 struct rq *rq;
Lennart Poetteringca94c442009-06-15 17:17:47 +02005252 int reset_on_fork;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005253
Steven Rostedt66e53932006-06-27 02:54:44 -07005254 /* may grab non-irq protected spin_locks */
5255 BUG_ON(in_interrupt());
Linus Torvalds1da177e2005-04-16 15:20:36 -07005256recheck:
5257 /* double check policy once rq lock held */
Lennart Poetteringca94c442009-06-15 17:17:47 +02005258 if (policy < 0) {
5259 reset_on_fork = p->sched_reset_on_fork;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005260 policy = oldpolicy = p->policy;
Lennart Poetteringca94c442009-06-15 17:17:47 +02005261 } else {
5262 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
5263 policy &= ~SCHED_RESET_ON_FORK;
5264
5265 if (policy != SCHED_FIFO && policy != SCHED_RR &&
5266 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
5267 policy != SCHED_IDLE)
5268 return -EINVAL;
5269 }
5270
Linus Torvalds1da177e2005-04-16 15:20:36 -07005271 /*
5272 * Valid priorities for SCHED_FIFO and SCHED_RR are
Ingo Molnardd41f592007-07-09 18:51:59 +02005273 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5274 * SCHED_BATCH and SCHED_IDLE is 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005275 */
5276 if (param->sched_priority < 0 ||
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07005277 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
Steven Rostedtd46523e2005-07-25 16:28:39 -04005278 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005279 return -EINVAL;
Ingo Molnare05606d2007-07-09 18:51:59 +02005280 if (rt_policy(policy) != (param->sched_priority != 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005281 return -EINVAL;
5282
Olivier Croquette37e4ab32005-06-25 14:57:32 -07005283 /*
5284 * Allow unprivileged RT tasks to decrease priority:
5285 */
Rusty Russell961ccdd2008-06-23 13:55:38 +10005286 if (user && !capable(CAP_SYS_NICE)) {
Ingo Molnare05606d2007-07-09 18:51:59 +02005287 if (rt_policy(policy)) {
Oleg Nesterova44702e2010-06-11 01:09:44 +02005288 unsigned long rlim_rtprio =
5289 task_rlimit(p, RLIMIT_RTPRIO);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07005290
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07005291 /* can't set/change the rt policy */
5292 if (policy != p->policy && !rlim_rtprio)
5293 return -EPERM;
5294
5295 /* can't increase priority */
5296 if (param->sched_priority > p->rt_priority &&
5297 param->sched_priority > rlim_rtprio)
5298 return -EPERM;
5299 }
Darren Hartc02aa732011-02-17 15:37:07 -08005300
Ingo Molnardd41f592007-07-09 18:51:59 +02005301 /*
Darren Hartc02aa732011-02-17 15:37:07 -08005302 * Treat SCHED_IDLE as nice 20. Only allow a switch to
5303 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
Ingo Molnardd41f592007-07-09 18:51:59 +02005304 */
Darren Hartc02aa732011-02-17 15:37:07 -08005305 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
5306 if (!can_nice(p, TASK_NICE(p)))
5307 return -EPERM;
5308 }
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07005309
Olivier Croquette37e4ab32005-06-25 14:57:32 -07005310 /* can't change other user's priorities */
David Howellsc69e8d92008-11-14 10:39:19 +11005311 if (!check_same_owner(p))
Olivier Croquette37e4ab32005-06-25 14:57:32 -07005312 return -EPERM;
Lennart Poetteringca94c442009-06-15 17:17:47 +02005313
5314 /* Normal users shall not reset the sched_reset_on_fork flag */
5315 if (p->sched_reset_on_fork && !reset_on_fork)
5316 return -EPERM;
Olivier Croquette37e4ab32005-06-25 14:57:32 -07005317 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005318
Jeremy Fitzhardinge725aad22008-08-03 09:33:03 -07005319 if (user) {
KOSAKI Motohirob0ae1982010-10-15 04:21:18 +09005320 retval = security_task_setscheduler(p);
Jeremy Fitzhardinge725aad22008-08-03 09:33:03 -07005321 if (retval)
5322 return retval;
5323 }
5324
Linus Torvalds1da177e2005-04-16 15:20:36 -07005325 /*
Ingo Molnarb29739f2006-06-27 02:54:51 -07005326 * make sure no PI-waiters arrive (or leave) while we are
5327 * changing the priority of the task:
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005328 *
Lucas De Marchi25985ed2011-03-30 22:57:33 -03005329 * To be able to change p->policy safely, the appropriate
Linus Torvalds1da177e2005-04-16 15:20:36 -07005330 * runqueue lock must be held.
5331 */
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005332 rq = task_rq_lock(p, &flags);
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02005333
Peter Zijlstra34f971f2010-09-22 13:53:15 +02005334 /*
5335 * Changing the policy of the stop threads its a very bad idea
5336 */
5337 if (p == rq->stop) {
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005338 task_rq_unlock(rq, p, &flags);
Peter Zijlstra34f971f2010-09-22 13:53:15 +02005339 return -EINVAL;
5340 }
5341
Dario Faggiolia51e9192011-03-24 14:00:18 +01005342 /*
5343 * If not changing anything there's no need to proceed further:
5344 */
5345 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5346 param->sched_priority == p->rt_priority))) {
5347
5348 __task_rq_unlock(rq);
5349 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5350 return 0;
5351 }
5352
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02005353#ifdef CONFIG_RT_GROUP_SCHED
5354 if (user) {
5355 /*
5356 * Do not allow realtime tasks into groups that have no runtime
5357 * assigned.
5358 */
5359 if (rt_bandwidth_enabled() && rt_policy(policy) &&
Mike Galbraithf4493772011-01-13 04:54:50 +01005360 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5361 !task_group_is_autogroup(task_group(p))) {
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005362 task_rq_unlock(rq, p, &flags);
Peter Zijlstradc61b1d2010-06-08 11:40:42 +02005363 return -EPERM;
5364 }
5365 }
5366#endif
5367
Linus Torvalds1da177e2005-04-16 15:20:36 -07005368 /* recheck policy now with rq lock held */
5369 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5370 policy = oldpolicy = -1;
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005371 task_rq_unlock(rq, p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005372 goto recheck;
5373 }
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02005374 on_rq = p->on_rq;
Dmitry Adamushko051a1d12007-12-18 15:21:13 +01005375 running = task_current(rq, p);
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07005376 if (on_rq)
Ingo Molnar2e1cb742007-08-09 11:16:49 +02005377 deactivate_task(rq, p, 0);
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07005378 if (running)
5379 p->sched_class->put_prev_task(rq, p);
Dmitry Adamushkof6b53202007-10-15 17:00:08 +02005380
Lennart Poetteringca94c442009-06-15 17:17:47 +02005381 p->sched_reset_on_fork = reset_on_fork;
5382
Linus Torvalds1da177e2005-04-16 15:20:36 -07005383 oldprio = p->prio;
Thomas Gleixner83ab0aa2010-02-17 09:05:48 +01005384 prev_class = p->sched_class;
Ingo Molnardd41f592007-07-09 18:51:59 +02005385 __setscheduler(rq, p, policy, param->sched_priority);
Dmitry Adamushkof6b53202007-10-15 17:00:08 +02005386
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07005387 if (running)
5388 p->sched_class->set_curr_task(rq);
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005389 if (on_rq)
Ingo Molnardd41f592007-07-09 18:51:59 +02005390 activate_task(rq, p, 0);
Steven Rostedtcb469842008-01-25 21:08:22 +01005391
Peter Zijlstrada7a7352011-01-17 17:03:27 +01005392 check_class_changed(rq, p, prev_class, oldprio);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005393 task_rq_unlock(rq, p, &flags);
Ingo Molnarb29739f2006-06-27 02:54:51 -07005394
Thomas Gleixner95e02ca2006-06-27 02:55:02 -07005395 rt_mutex_adjust_pi(p);
5396
Linus Torvalds1da177e2005-04-16 15:20:36 -07005397 return 0;
5398}
Rusty Russell961ccdd2008-06-23 13:55:38 +10005399
5400/**
5401 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5402 * @p: the task in question.
5403 * @policy: new policy.
5404 * @param: structure containing the new RT priority.
5405 *
5406 * NOTE that the task may be already dead.
5407 */
5408int sched_setscheduler(struct task_struct *p, int policy,
KOSAKI Motohirofe7de492010-10-20 16:01:12 -07005409 const struct sched_param *param)
Rusty Russell961ccdd2008-06-23 13:55:38 +10005410{
5411 return __sched_setscheduler(p, policy, param, true);
5412}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005413EXPORT_SYMBOL_GPL(sched_setscheduler);
5414
Rusty Russell961ccdd2008-06-23 13:55:38 +10005415/**
5416 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5417 * @p: the task in question.
5418 * @policy: new policy.
5419 * @param: structure containing the new RT priority.
5420 *
5421 * Just like sched_setscheduler, only don't bother checking if the
5422 * current context has permission. For example, this is needed in
5423 * stop_machine(): we create temporary high priority worker threads,
5424 * but our caller might not have that capability.
5425 */
5426int sched_setscheduler_nocheck(struct task_struct *p, int policy,
KOSAKI Motohirofe7de492010-10-20 16:01:12 -07005427 const struct sched_param *param)
Rusty Russell961ccdd2008-06-23 13:55:38 +10005428{
5429 return __sched_setscheduler(p, policy, param, false);
5430}
5431
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07005432static int
5433do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005434{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005435 struct sched_param lparam;
5436 struct task_struct *p;
Ingo Molnar36c8b582006-07-03 00:25:41 -07005437 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005438
5439 if (!param || pid < 0)
5440 return -EINVAL;
5441 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
5442 return -EFAULT;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07005443
5444 rcu_read_lock();
5445 retval = -ESRCH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005446 p = find_process_by_pid(pid);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07005447 if (p != NULL)
5448 retval = sched_setscheduler(p, policy, &lparam);
5449 rcu_read_unlock();
Ingo Molnar36c8b582006-07-03 00:25:41 -07005450
Linus Torvalds1da177e2005-04-16 15:20:36 -07005451 return retval;
5452}
5453
5454/**
5455 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
5456 * @pid: the pid in question.
5457 * @policy: new policy.
5458 * @param: structure containing the new RT priority.
5459 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005460SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
5461 struct sched_param __user *, param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005462{
Jason Baronc21761f2006-01-18 17:43:03 -08005463 /* negative values for policy are not valid */
5464 if (policy < 0)
5465 return -EINVAL;
5466
Linus Torvalds1da177e2005-04-16 15:20:36 -07005467 return do_sched_setscheduler(pid, policy, param);
5468}
5469
5470/**
5471 * sys_sched_setparam - set/change the RT priority of a thread
5472 * @pid: the pid in question.
5473 * @param: structure containing the new RT priority.
5474 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005475SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005476{
5477 return do_sched_setscheduler(pid, -1, param);
5478}
5479
5480/**
5481 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
5482 * @pid: the pid in question.
5483 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005484SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005485{
Ingo Molnar36c8b582006-07-03 00:25:41 -07005486 struct task_struct *p;
Andi Kleen3a5c3592007-10-15 17:00:14 +02005487 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005488
5489 if (pid < 0)
Andi Kleen3a5c3592007-10-15 17:00:14 +02005490 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005491
5492 retval = -ESRCH;
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00005493 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005494 p = find_process_by_pid(pid);
5495 if (p) {
5496 retval = security_task_getscheduler(p);
5497 if (!retval)
Lennart Poetteringca94c442009-06-15 17:17:47 +02005498 retval = p->policy
5499 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005500 }
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00005501 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005502 return retval;
5503}
5504
5505/**
Lennart Poetteringca94c442009-06-15 17:17:47 +02005506 * sys_sched_getparam - get the RT priority of a thread
Linus Torvalds1da177e2005-04-16 15:20:36 -07005507 * @pid: the pid in question.
5508 * @param: structure containing the RT priority.
5509 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005510SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005511{
5512 struct sched_param lp;
Ingo Molnar36c8b582006-07-03 00:25:41 -07005513 struct task_struct *p;
Andi Kleen3a5c3592007-10-15 17:00:14 +02005514 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005515
5516 if (!param || pid < 0)
Andi Kleen3a5c3592007-10-15 17:00:14 +02005517 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005518
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00005519 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005520 p = find_process_by_pid(pid);
5521 retval = -ESRCH;
5522 if (!p)
5523 goto out_unlock;
5524
5525 retval = security_task_getscheduler(p);
5526 if (retval)
5527 goto out_unlock;
5528
5529 lp.sched_priority = p->rt_priority;
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00005530 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005531
5532 /*
5533 * This one might sleep, we cannot do it with a spinlock held ...
5534 */
5535 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
5536
Linus Torvalds1da177e2005-04-16 15:20:36 -07005537 return retval;
5538
5539out_unlock:
Thomas Gleixner5fe85be2009-12-09 10:14:58 +00005540 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005541 return retval;
5542}
5543
Rusty Russell96f874e2008-11-25 02:35:14 +10305544long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005545{
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305546 cpumask_var_t cpus_allowed, new_mask;
Ingo Molnar36c8b582006-07-03 00:25:41 -07005547 struct task_struct *p;
5548 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005549
Gautham R Shenoy95402b32008-01-25 21:08:02 +01005550 get_online_cpus();
Thomas Gleixner23f5d142009-12-09 10:15:01 +00005551 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005552
5553 p = find_process_by_pid(pid);
5554 if (!p) {
Thomas Gleixner23f5d142009-12-09 10:15:01 +00005555 rcu_read_unlock();
Gautham R Shenoy95402b32008-01-25 21:08:02 +01005556 put_online_cpus();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005557 return -ESRCH;
5558 }
5559
Thomas Gleixner23f5d142009-12-09 10:15:01 +00005560 /* Prevent p going away */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005561 get_task_struct(p);
Thomas Gleixner23f5d142009-12-09 10:15:01 +00005562 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005563
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305564 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
5565 retval = -ENOMEM;
5566 goto out_put_task;
5567 }
5568 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
5569 retval = -ENOMEM;
5570 goto out_free_cpus_allowed;
5571 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005572 retval = -EPERM;
Serge E. Hallynb0e77592011-03-23 16:43:24 -07005573 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005574 goto out_unlock;
5575
KOSAKI Motohirob0ae1982010-10-15 04:21:18 +09005576 retval = security_task_setscheduler(p);
David Quigleye7834f82006-06-23 02:03:59 -07005577 if (retval)
5578 goto out_unlock;
5579
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305580 cpuset_cpus_allowed(p, cpus_allowed);
5581 cpumask_and(new_mask, in_mask, cpus_allowed);
Peter Zijlstra49246272010-10-17 21:46:10 +02005582again:
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305583 retval = set_cpus_allowed_ptr(p, new_mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005584
Paul Menage8707d8b2007-10-18 23:40:22 -07005585 if (!retval) {
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305586 cpuset_cpus_allowed(p, cpus_allowed);
5587 if (!cpumask_subset(new_mask, cpus_allowed)) {
Paul Menage8707d8b2007-10-18 23:40:22 -07005588 /*
5589 * We must have raced with a concurrent cpuset
5590 * update. Just reset the cpus_allowed to the
5591 * cpuset's cpus_allowed
5592 */
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305593 cpumask_copy(new_mask, cpus_allowed);
Paul Menage8707d8b2007-10-18 23:40:22 -07005594 goto again;
5595 }
5596 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005597out_unlock:
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305598 free_cpumask_var(new_mask);
5599out_free_cpus_allowed:
5600 free_cpumask_var(cpus_allowed);
5601out_put_task:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005602 put_task_struct(p);
Gautham R Shenoy95402b32008-01-25 21:08:02 +01005603 put_online_cpus();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005604 return retval;
5605}
5606
5607static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
Rusty Russell96f874e2008-11-25 02:35:14 +10305608 struct cpumask *new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005609{
Rusty Russell96f874e2008-11-25 02:35:14 +10305610 if (len < cpumask_size())
5611 cpumask_clear(new_mask);
5612 else if (len > cpumask_size())
5613 len = cpumask_size();
5614
Linus Torvalds1da177e2005-04-16 15:20:36 -07005615 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
5616}
5617
5618/**
5619 * sys_sched_setaffinity - set the cpu affinity of a process
5620 * @pid: pid of the process
5621 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5622 * @user_mask_ptr: user-space pointer to the new cpu mask
5623 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005624SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
5625 unsigned long __user *, user_mask_ptr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005626{
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305627 cpumask_var_t new_mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005628 int retval;
5629
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305630 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
5631 return -ENOMEM;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005632
Rusty Russell5a16f3d2008-11-25 02:35:11 +10305633 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
5634 if (retval == 0)
5635 retval = sched_setaffinity(pid, new_mask);
5636 free_cpumask_var(new_mask);
5637 return retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005638}
5639
Rusty Russell96f874e2008-11-25 02:35:14 +10305640long sched_getaffinity(pid_t pid, struct cpumask *mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005641{
Ingo Molnar36c8b582006-07-03 00:25:41 -07005642 struct task_struct *p;
Thomas Gleixner31605682009-12-08 20:24:16 +00005643 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005644 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005645
Gautham R Shenoy95402b32008-01-25 21:08:02 +01005646 get_online_cpus();
Thomas Gleixner23f5d142009-12-09 10:15:01 +00005647 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005648
5649 retval = -ESRCH;
5650 p = find_process_by_pid(pid);
5651 if (!p)
5652 goto out_unlock;
5653
David Quigleye7834f82006-06-23 02:03:59 -07005654 retval = security_task_getscheduler(p);
5655 if (retval)
5656 goto out_unlock;
5657
Peter Zijlstra013fdb82011-04-05 17:23:45 +02005658 raw_spin_lock_irqsave(&p->pi_lock, flags);
Rusty Russell96f874e2008-11-25 02:35:14 +10305659 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
Peter Zijlstra013fdb82011-04-05 17:23:45 +02005660 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005661
5662out_unlock:
Thomas Gleixner23f5d142009-12-09 10:15:01 +00005663 rcu_read_unlock();
Gautham R Shenoy95402b32008-01-25 21:08:02 +01005664 put_online_cpus();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005665
Ulrich Drepper9531b622007-08-09 11:16:46 +02005666 return retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005667}
5668
5669/**
5670 * sys_sched_getaffinity - get the cpu affinity of a process
5671 * @pid: pid of the process
5672 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
5673 * @user_mask_ptr: user-space pointer to hold the current cpu mask
5674 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005675SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
5676 unsigned long __user *, user_mask_ptr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005677{
5678 int ret;
Rusty Russellf17c8602008-11-25 02:35:11 +10305679 cpumask_var_t mask;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005680
Anton Blanchard84fba5e2010-04-06 17:02:19 +10005681 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
KOSAKI Motohirocd3d8032010-03-12 16:15:36 +09005682 return -EINVAL;
5683 if (len & (sizeof(unsigned long)-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005684 return -EINVAL;
5685
Rusty Russellf17c8602008-11-25 02:35:11 +10305686 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
5687 return -ENOMEM;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005688
Rusty Russellf17c8602008-11-25 02:35:11 +10305689 ret = sched_getaffinity(pid, mask);
5690 if (ret == 0) {
KOSAKI Motohiro8bc037f2010-03-17 09:36:58 +09005691 size_t retlen = min_t(size_t, len, cpumask_size());
KOSAKI Motohirocd3d8032010-03-12 16:15:36 +09005692
5693 if (copy_to_user(user_mask_ptr, mask, retlen))
Rusty Russellf17c8602008-11-25 02:35:11 +10305694 ret = -EFAULT;
5695 else
KOSAKI Motohirocd3d8032010-03-12 16:15:36 +09005696 ret = retlen;
Rusty Russellf17c8602008-11-25 02:35:11 +10305697 }
5698 free_cpumask_var(mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005699
Rusty Russellf17c8602008-11-25 02:35:11 +10305700 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005701}
5702
5703/**
5704 * sys_sched_yield - yield the current processor to other threads.
5705 *
Ingo Molnardd41f592007-07-09 18:51:59 +02005706 * This function yields the current CPU to other tasks. If there are no
5707 * other threads running on this CPU then this function will return.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005708 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005709SYSCALL_DEFINE0(sched_yield)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005710{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005711 struct rq *rq = this_rq_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005712
Ingo Molnar2d723762007-10-15 17:00:12 +02005713 schedstat_inc(rq, yld_count);
Dmitry Adamushko4530d7a2007-10-15 17:00:08 +02005714 current->sched_class->yield_task(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005715
5716 /*
5717 * Since we are going to call schedule() anyway, there's
5718 * no need to preempt or enable interrupts:
5719 */
5720 __release(rq->lock);
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07005721 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Thomas Gleixner9828ea92009-12-03 20:55:53 +01005722 do_raw_spin_unlock(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005723 preempt_enable_no_resched();
5724
5725 schedule();
5726
5727 return 0;
5728}
5729
Peter Zijlstrad86ee482009-07-10 14:57:57 +02005730static inline int should_resched(void)
5731{
5732 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
5733}
5734
Andrew Mortone7b38402006-06-30 01:56:00 -07005735static void __cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005736{
Frederic Weisbeckere7aaaa62009-07-16 15:44:29 +02005737 add_preempt_count(PREEMPT_ACTIVE);
Thomas Gleixnerc259e012011-06-22 19:47:00 +02005738 __schedule();
Frederic Weisbeckere7aaaa62009-07-16 15:44:29 +02005739 sub_preempt_count(PREEMPT_ACTIVE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005740}
5741
Herbert Xu02b67cc32008-01-25 21:08:28 +01005742int __sched _cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005743{
Peter Zijlstrad86ee482009-07-10 14:57:57 +02005744 if (should_resched()) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005745 __cond_resched();
5746 return 1;
5747 }
5748 return 0;
5749}
Herbert Xu02b67cc32008-01-25 21:08:28 +01005750EXPORT_SYMBOL(_cond_resched);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005751
5752/*
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005753 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
Linus Torvalds1da177e2005-04-16 15:20:36 -07005754 * call schedule, and on return reacquire the lock.
5755 *
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01005756 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
Linus Torvalds1da177e2005-04-16 15:20:36 -07005757 * operations here to prevent schedule() from being called twice (once via
5758 * spin_unlock(), once by hand).
5759 */
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005760int __cond_resched_lock(spinlock_t *lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005761{
Peter Zijlstrad86ee482009-07-10 14:57:57 +02005762 int resched = should_resched();
Jan Kara6df3cec2005-06-13 15:52:32 -07005763 int ret = 0;
5764
Peter Zijlstraf607c662009-07-20 19:16:29 +02005765 lockdep_assert_held(lock);
5766
Nick Piggin95c354f2008-01-30 13:31:20 +01005767 if (spin_needbreak(lock) || resched) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005768 spin_unlock(lock);
Peter Zijlstrad86ee482009-07-10 14:57:57 +02005769 if (resched)
Nick Piggin95c354f2008-01-30 13:31:20 +01005770 __cond_resched();
5771 else
5772 cpu_relax();
Jan Kara6df3cec2005-06-13 15:52:32 -07005773 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005774 spin_lock(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005775 }
Jan Kara6df3cec2005-06-13 15:52:32 -07005776 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005777}
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005778EXPORT_SYMBOL(__cond_resched_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005779
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005780int __sched __cond_resched_softirq(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005781{
5782 BUG_ON(!in_softirq());
5783
Peter Zijlstrad86ee482009-07-10 14:57:57 +02005784 if (should_resched()) {
Thomas Gleixner98d825672007-05-23 13:58:18 -07005785 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005786 __cond_resched();
5787 local_bh_disable();
5788 return 1;
5789 }
5790 return 0;
5791}
Frederic Weisbecker613afbf2009-07-16 15:44:29 +02005792EXPORT_SYMBOL(__cond_resched_softirq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005793
Linus Torvalds1da177e2005-04-16 15:20:36 -07005794/**
5795 * yield - yield the current processor to other threads.
5796 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08005797 * This is a shortcut for kernel-space yielding - it marks the
Linus Torvalds1da177e2005-04-16 15:20:36 -07005798 * thread runnable and calls sys_sched_yield().
5799 */
5800void __sched yield(void)
5801{
5802 set_current_state(TASK_RUNNING);
5803 sys_sched_yield();
5804}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005805EXPORT_SYMBOL(yield);
5806
Mike Galbraithd95f4122011-02-01 09:50:51 -05005807/**
5808 * yield_to - yield the current processor to another thread in
5809 * your thread group, or accelerate that thread toward the
5810 * processor it's on.
Randy Dunlap16addf92011-03-18 09:34:53 -07005811 * @p: target task
5812 * @preempt: whether task preemption is allowed or not
Mike Galbraithd95f4122011-02-01 09:50:51 -05005813 *
5814 * It's the caller's job to ensure that the target task struct
5815 * can't go away on us before we can do any checks.
5816 *
5817 * Returns true if we indeed boosted the target task.
5818 */
5819bool __sched yield_to(struct task_struct *p, bool preempt)
5820{
5821 struct task_struct *curr = current;
5822 struct rq *rq, *p_rq;
5823 unsigned long flags;
5824 bool yielded = 0;
5825
5826 local_irq_save(flags);
5827 rq = this_rq();
5828
5829again:
5830 p_rq = task_rq(p);
5831 double_rq_lock(rq, p_rq);
5832 while (task_rq(p) != p_rq) {
5833 double_rq_unlock(rq, p_rq);
5834 goto again;
5835 }
5836
5837 if (!curr->sched_class->yield_to_task)
5838 goto out;
5839
5840 if (curr->sched_class != p->sched_class)
5841 goto out;
5842
5843 if (task_running(p_rq, p) || p->state)
5844 goto out;
5845
5846 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
Venkatesh Pallipadi6d1cafd2011-03-01 16:28:21 -08005847 if (yielded) {
Mike Galbraithd95f4122011-02-01 09:50:51 -05005848 schedstat_inc(rq, yld_count);
Venkatesh Pallipadi6d1cafd2011-03-01 16:28:21 -08005849 /*
5850 * Make p's CPU reschedule; pick_next_entity takes care of
5851 * fairness.
5852 */
5853 if (preempt && rq != p_rq)
5854 resched_task(p_rq->curr);
5855 }
Mike Galbraithd95f4122011-02-01 09:50:51 -05005856
5857out:
5858 double_rq_unlock(rq, p_rq);
5859 local_irq_restore(flags);
5860
5861 if (yielded)
5862 schedule();
5863
5864 return yielded;
5865}
5866EXPORT_SYMBOL_GPL(yield_to);
5867
Linus Torvalds1da177e2005-04-16 15:20:36 -07005868/*
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01005869 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
Linus Torvalds1da177e2005-04-16 15:20:36 -07005870 * that process accounting knows that this is a task in IO wait state.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005871 */
5872void __sched io_schedule(void)
5873{
Hitoshi Mitake54d35f22009-06-29 14:44:57 +09005874 struct rq *rq = raw_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005875
Shailabh Nagar0ff92242006-07-14 00:24:37 -07005876 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005877 atomic_inc(&rq->nr_iowait);
Jens Axboe73c10102011-03-08 13:19:51 +01005878 blk_flush_plug(current);
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07005879 current->in_iowait = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005880 schedule();
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07005881 current->in_iowait = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005882 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07005883 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005884}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005885EXPORT_SYMBOL(io_schedule);
5886
5887long __sched io_schedule_timeout(long timeout)
5888{
Hitoshi Mitake54d35f22009-06-29 14:44:57 +09005889 struct rq *rq = raw_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005890 long ret;
5891
Shailabh Nagar0ff92242006-07-14 00:24:37 -07005892 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005893 atomic_inc(&rq->nr_iowait);
Jens Axboe73c10102011-03-08 13:19:51 +01005894 blk_flush_plug(current);
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07005895 current->in_iowait = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005896 ret = schedule_timeout(timeout);
Arjan van de Ven8f0dfc32009-07-20 11:26:58 -07005897 current->in_iowait = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005898 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07005899 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005900 return ret;
5901}
5902
5903/**
5904 * sys_sched_get_priority_max - return maximum RT priority.
5905 * @policy: scheduling class.
5906 *
5907 * this syscall returns the maximum rt_priority that can be used
5908 * by a given scheduling class.
5909 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005910SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005911{
5912 int ret = -EINVAL;
5913
5914 switch (policy) {
5915 case SCHED_FIFO:
5916 case SCHED_RR:
5917 ret = MAX_USER_RT_PRIO-1;
5918 break;
5919 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08005920 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02005921 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005922 ret = 0;
5923 break;
5924 }
5925 return ret;
5926}
5927
5928/**
5929 * sys_sched_get_priority_min - return minimum RT priority.
5930 * @policy: scheduling class.
5931 *
5932 * this syscall returns the minimum rt_priority that can be used
5933 * by a given scheduling class.
5934 */
Heiko Carstens5add95d2009-01-14 14:14:08 +01005935SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005936{
5937 int ret = -EINVAL;
5938
5939 switch (policy) {
5940 case SCHED_FIFO:
5941 case SCHED_RR:
5942 ret = 1;
5943 break;
5944 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08005945 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02005946 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005947 ret = 0;
5948 }
5949 return ret;
5950}
5951
5952/**
5953 * sys_sched_rr_get_interval - return the default timeslice of a process.
5954 * @pid: pid of the process.
5955 * @interval: userspace pointer to the timeslice value.
5956 *
5957 * this syscall writes the default timeslice value of a given process
5958 * into the user-space timespec buffer. A value of '0' means infinity.
5959 */
Heiko Carstens17da2bd2009-01-14 14:14:10 +01005960SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
Heiko Carstens754fe8d2009-01-14 14:14:09 +01005961 struct timespec __user *, interval)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005962{
Ingo Molnar36c8b582006-07-03 00:25:41 -07005963 struct task_struct *p;
Dmitry Adamushkoa4ec24b2007-10-15 17:00:13 +02005964 unsigned int time_slice;
Thomas Gleixnerdba091b2009-12-09 09:32:03 +01005965 unsigned long flags;
5966 struct rq *rq;
Andi Kleen3a5c3592007-10-15 17:00:14 +02005967 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005968 struct timespec t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005969
5970 if (pid < 0)
Andi Kleen3a5c3592007-10-15 17:00:14 +02005971 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005972
5973 retval = -ESRCH;
Thomas Gleixner1a551ae2009-12-09 10:15:11 +00005974 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005975 p = find_process_by_pid(pid);
5976 if (!p)
5977 goto out_unlock;
5978
5979 retval = security_task_getscheduler(p);
5980 if (retval)
5981 goto out_unlock;
5982
Thomas Gleixnerdba091b2009-12-09 09:32:03 +01005983 rq = task_rq_lock(p, &flags);
5984 time_slice = p->sched_class->get_rr_interval(rq, p);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02005985 task_rq_unlock(rq, p, &flags);
Dmitry Adamushkoa4ec24b2007-10-15 17:00:13 +02005986
Thomas Gleixner1a551ae2009-12-09 10:15:11 +00005987 rcu_read_unlock();
Dmitry Adamushkoa4ec24b2007-10-15 17:00:13 +02005988 jiffies_to_timespec(time_slice, &t);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005989 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005990 return retval;
Andi Kleen3a5c3592007-10-15 17:00:14 +02005991
Linus Torvalds1da177e2005-04-16 15:20:36 -07005992out_unlock:
Thomas Gleixner1a551ae2009-12-09 10:15:11 +00005993 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07005994 return retval;
5995}
5996
Steven Rostedt7c731e02008-05-12 21:20:41 +02005997static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
Ingo Molnar36c8b582006-07-03 00:25:41 -07005998
Ingo Molnar82a1fcb2008-01-25 21:08:02 +01005999void sched_show_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006000{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006001 unsigned long free = 0;
Ingo Molnar36c8b582006-07-03 00:25:41 -07006002 unsigned state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006003
Linus Torvalds1da177e2005-04-16 15:20:36 -07006004 state = p->state ? __ffs(p->state) + 1 : 0;
Erik Gilling28d06862010-11-19 18:08:51 -08006005 printk(KERN_INFO "%-15.15s %c", p->comm,
Andreas Mohr2ed6e342006-07-10 04:43:52 -07006006 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
Ingo Molnar4bd77322007-07-11 21:21:47 +02006007#if BITS_PER_LONG == 32
Linus Torvalds1da177e2005-04-16 15:20:36 -07006008 if (state == TASK_RUNNING)
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006009 printk(KERN_CONT " running ");
Linus Torvalds1da177e2005-04-16 15:20:36 -07006010 else
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006011 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
Linus Torvalds1da177e2005-04-16 15:20:36 -07006012#else
6013 if (state == TASK_RUNNING)
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006014 printk(KERN_CONT " running task ");
Linus Torvalds1da177e2005-04-16 15:20:36 -07006015 else
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006016 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
Linus Torvalds1da177e2005-04-16 15:20:36 -07006017#endif
6018#ifdef CONFIG_DEBUG_STACK_USAGE
Eric Sandeen7c9f8862008-04-22 16:38:23 -05006019 free = stack_not_used(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006020#endif
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006021 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
David Rientjesaa47b7e2009-05-04 01:38:05 -07006022 task_pid_nr(p), task_pid_nr(p->real_parent),
6023 (unsigned long)task_thread_info(p)->flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006024
Nick Piggin5fb5e6d2008-01-25 21:08:34 +01006025 show_stack(p, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006026}
6027
Ingo Molnare59e2ae2006-12-06 20:35:59 -08006028void show_state_filter(unsigned long state_filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006029{
Ingo Molnar36c8b582006-07-03 00:25:41 -07006030 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006031
Ingo Molnar4bd77322007-07-11 21:21:47 +02006032#if BITS_PER_LONG == 32
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006033 printk(KERN_INFO
6034 " task PC stack pid father\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07006035#else
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006036 printk(KERN_INFO
6037 " task PC stack pid father\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07006038#endif
Thomas Gleixner510f5ac2011-07-17 20:47:54 +02006039 rcu_read_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006040 do_each_thread(g, p) {
6041 /*
6042 * reset the NMI-timeout, listing all files on a slow
Lucas De Marchi25985ed2011-03-30 22:57:33 -03006043 * console might take a lot of time:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006044 */
6045 touch_nmi_watchdog();
Ingo Molnar39bc89f2007-04-25 20:50:03 -07006046 if (!state_filter || (p->state & state_filter))
Ingo Molnar82a1fcb2008-01-25 21:08:02 +01006047 sched_show_task(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006048 } while_each_thread(g, p);
6049
Jeremy Fitzhardinge04c91672007-05-08 00:28:05 -07006050 touch_all_softlockup_watchdogs();
6051
Ingo Molnardd41f592007-07-09 18:51:59 +02006052#ifdef CONFIG_SCHED_DEBUG
6053 sysrq_sched_debug_show();
6054#endif
Thomas Gleixner510f5ac2011-07-17 20:47:54 +02006055 rcu_read_unlock();
Ingo Molnare59e2ae2006-12-06 20:35:59 -08006056 /*
6057 * Only show locks if all tasks are dumped:
6058 */
Shmulik Ladkani93335a22009-11-25 15:23:41 +02006059 if (!state_filter)
Ingo Molnare59e2ae2006-12-06 20:35:59 -08006060 debug_show_all_locks();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006061}
6062
Ingo Molnar1df21052007-07-09 18:51:58 +02006063void __cpuinit init_idle_bootup_task(struct task_struct *idle)
6064{
Ingo Molnardd41f592007-07-09 18:51:59 +02006065 idle->sched_class = &idle_sched_class;
Ingo Molnar1df21052007-07-09 18:51:58 +02006066}
6067
Ingo Molnarf340c0d2005-06-28 16:40:42 +02006068/**
6069 * init_idle - set up an idle thread for a given CPU
6070 * @idle: task in question
6071 * @cpu: cpu the idle task belongs to
6072 *
6073 * NOTE: this function does not set the idle thread's NEED_RESCHED
6074 * flag, to make booting more robust.
6075 */
Nick Piggin5c1e1762006-10-03 01:14:04 -07006076void __cpuinit init_idle(struct task_struct *idle, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006077{
Ingo Molnar70b97a72006-07-03 00:25:42 -07006078 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006079 unsigned long flags;
6080
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006081 raw_spin_lock_irqsave(&rq->lock, flags);
Ingo Molnar5cbd54e2008-11-12 20:05:50 +01006082
Ingo Molnardd41f592007-07-09 18:51:59 +02006083 __sched_fork(idle);
Peter Zijlstra06b83b52009-12-16 18:04:35 +01006084 idle->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02006085 idle->se.exec_start = sched_clock();
6086
KOSAKI Motohiro1e1b6c52011-05-19 15:08:58 +09006087 do_set_cpus_allowed(idle, cpumask_of(cpu));
Peter Zijlstra6506cf6c2010-09-16 17:50:31 +02006088 /*
6089 * We're having a chicken and egg problem, even though we are
6090 * holding rq->lock, the cpu isn't yet set to this cpu so the
6091 * lockdep check in task_group() will fail.
6092 *
6093 * Similar case to sched_fork(). / Alternatively we could
6094 * use task_rq_lock() here and obtain the other rq->lock.
6095 *
6096 * Silence PROVE_RCU
6097 */
6098 rcu_read_lock();
Ingo Molnardd41f592007-07-09 18:51:59 +02006099 __set_task_cpu(idle, cpu);
Peter Zijlstra6506cf6c2010-09-16 17:50:31 +02006100 rcu_read_unlock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006101
Linus Torvalds1da177e2005-04-16 15:20:36 -07006102 rq->curr = rq->idle = idle;
Peter Zijlstra3ca7a442011-04-05 17:23:40 +02006103#if defined(CONFIG_SMP)
6104 idle->on_cpu = 1;
Nick Piggin4866cde2005-06-25 14:57:23 -07006105#endif
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006106 raw_spin_unlock_irqrestore(&rq->lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006107
6108 /* Set the preempt count _outside_ the spinlocks! */
Al Viroa1261f52005-11-13 16:06:55 -08006109 task_thread_info(idle)->preempt_count = 0;
Jonathan Corbet625f2a32011-04-22 11:19:10 -06006110
Ingo Molnardd41f592007-07-09 18:51:59 +02006111 /*
6112 * The idle tasks have their own, simple scheduling class:
6113 */
6114 idle->sched_class = &idle_sched_class;
Steven Rostedt868baf02011-02-10 21:26:13 -05006115 ftrace_graph_init_idle_task(idle, cpu);
Carsten Emdef1c6f1a2011-10-26 23:14:16 +02006116#if defined(CONFIG_SMP)
6117 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6118#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006119}
6120
6121/*
Ingo Molnar19978ca2007-11-09 22:39:38 +01006122 * Increase the granularity value when there are more CPUs,
6123 * because with more CPUs the 'effective latency' as visible
6124 * to users decreases. But the relationship is not linear,
6125 * so pick a second-best guess by going with the log2 of the
6126 * number of CPUs.
6127 *
6128 * This idea comes from the SD scheduler of Con Kolivas:
6129 */
Christian Ehrhardtacb4a842009-11-30 12:16:48 +01006130static int get_update_sysctl_factor(void)
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01006131{
Mike Galbraith4ca3ef72009-12-10 09:25:53 +01006132 unsigned int cpus = min_t(int, num_online_cpus(), 8);
Christian Ehrhardt1983a922009-11-30 12:16:47 +01006133 unsigned int factor;
6134
6135 switch (sysctl_sched_tunable_scaling) {
6136 case SCHED_TUNABLESCALING_NONE:
6137 factor = 1;
6138 break;
6139 case SCHED_TUNABLESCALING_LINEAR:
6140 factor = cpus;
6141 break;
6142 case SCHED_TUNABLESCALING_LOG:
6143 default:
6144 factor = 1 + ilog2(cpus);
6145 break;
6146 }
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01006147
Christian Ehrhardtacb4a842009-11-30 12:16:48 +01006148 return factor;
6149}
6150
6151static void update_sysctl(void)
6152{
6153 unsigned int factor = get_update_sysctl_factor();
6154
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01006155#define SET_SYSCTL(name) \
6156 (sysctl_##name = (factor) * normalized_sysctl_##name)
6157 SET_SYSCTL(sched_min_granularity);
6158 SET_SYSCTL(sched_latency);
6159 SET_SYSCTL(sched_wakeup_granularity);
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01006160#undef SET_SYSCTL
6161}
6162
Ingo Molnar19978ca2007-11-09 22:39:38 +01006163static inline void sched_init_granularity(void)
6164{
Christian Ehrhardt0bcdcf22009-11-30 12:16:46 +01006165 update_sysctl();
Ingo Molnar19978ca2007-11-09 22:39:38 +01006166}
6167
Linus Torvalds1da177e2005-04-16 15:20:36 -07006168#ifdef CONFIG_SMP
KOSAKI Motohiro1e1b6c52011-05-19 15:08:58 +09006169void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6170{
6171 if (p->sched_class && p->sched_class->set_cpus_allowed)
6172 p->sched_class->set_cpus_allowed(p, new_mask);
Peter Zijlstra49396022011-06-25 15:45:46 +02006173
6174 cpumask_copy(&p->cpus_allowed, new_mask);
6175 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
KOSAKI Motohiro1e1b6c52011-05-19 15:08:58 +09006176}
6177
Linus Torvalds1da177e2005-04-16 15:20:36 -07006178/*
6179 * This is how migration works:
6180 *
Tejun Heo969c7922010-05-06 18:49:21 +02006181 * 1) we invoke migration_cpu_stop() on the target CPU using
6182 * stop_one_cpu().
6183 * 2) stopper starts to run (implicitly forcing the migrated thread
6184 * off the CPU)
6185 * 3) it checks whether the migrated task is still in the wrong runqueue.
6186 * 4) if it's in the wrong runqueue then the migration thread removes
Linus Torvalds1da177e2005-04-16 15:20:36 -07006187 * it and puts it into the right queue.
Tejun Heo969c7922010-05-06 18:49:21 +02006188 * 5) stopper completes and stop_one_cpu() returns and the migration
6189 * is done.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006190 */
6191
6192/*
6193 * Change a given task's CPU affinity. Migrate the thread to a
6194 * proper CPU and schedule it away if the CPU it's executing on
6195 * is removed from the allowed bitmask.
6196 *
6197 * NOTE: the caller must have a valid reference to the task, the
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01006198 * task must not exit() & deallocate itself prematurely. The
Linus Torvalds1da177e2005-04-16 15:20:36 -07006199 * call is not atomic; no spinlocks may be held.
6200 */
Rusty Russell96f874e2008-11-25 02:35:14 +10306201int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006202{
6203 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006204 struct rq *rq;
Tejun Heo969c7922010-05-06 18:49:21 +02006205 unsigned int dest_cpu;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006206 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006207
6208 rq = task_rq_lock(p, &flags);
Peter Zijlstrae2912002009-12-16 18:04:36 +01006209
Yong Zhangdb44fc02011-05-09 22:07:05 +08006210 if (cpumask_equal(&p->cpus_allowed, new_mask))
6211 goto out;
6212
Peter Zijlstra6ad4c182009-11-25 13:31:39 +01006213 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006214 ret = -EINVAL;
6215 goto out;
6216 }
6217
Yong Zhangdb44fc02011-05-09 22:07:05 +08006218 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
David Rientjes9985b0b2008-06-05 12:57:11 -07006219 ret = -EINVAL;
6220 goto out;
6221 }
6222
KOSAKI Motohiro1e1b6c52011-05-19 15:08:58 +09006223 do_set_cpus_allowed(p, new_mask);
Gregory Haskins73fe6aa2008-01-25 21:08:07 +01006224
Linus Torvalds1da177e2005-04-16 15:20:36 -07006225 /* Can the task run on the task's current CPU? If so, we're done */
Rusty Russell96f874e2008-11-25 02:35:14 +10306226 if (cpumask_test_cpu(task_cpu(p), new_mask))
Linus Torvalds1da177e2005-04-16 15:20:36 -07006227 goto out;
6228
Tejun Heo969c7922010-05-06 18:49:21 +02006229 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
Peter Zijlstrabd8e7dd2011-04-05 17:23:59 +02006230 if (p->on_rq) {
Tejun Heo969c7922010-05-06 18:49:21 +02006231 struct migration_arg arg = { p, dest_cpu };
Linus Torvalds1da177e2005-04-16 15:20:36 -07006232 /* Need help from migration thread: drop lock and wait. */
Peter Zijlstra0122ec52011-04-05 17:23:51 +02006233 task_rq_unlock(rq, p, &flags);
Tejun Heo969c7922010-05-06 18:49:21 +02006234 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006235 tlb_migrate_finish(p->mm);
6236 return 0;
6237 }
6238out:
Peter Zijlstra0122ec52011-04-05 17:23:51 +02006239 task_rq_unlock(rq, p, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07006240
Linus Torvalds1da177e2005-04-16 15:20:36 -07006241 return ret;
6242}
Mike Traviscd8ba7c2008-03-26 14:23:49 -07006243EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006244
6245/*
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01006246 * Move (not current) task off this cpu, onto dest cpu. We're doing
Linus Torvalds1da177e2005-04-16 15:20:36 -07006247 * this because either it can't run here any more (set_cpus_allowed()
6248 * away from this CPU, or CPU going down), or because we're
6249 * attempting to rebalance this task on exec (sched_exec).
6250 *
6251 * So we race with normal scheduler movements, but that's OK, as long
6252 * as the task is no longer on this CPU.
Kirill Korotaevefc30812006-06-27 02:54:32 -07006253 *
6254 * Returns non-zero if task was successfully migrated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006255 */
Kirill Korotaevefc30812006-06-27 02:54:32 -07006256static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006257{
Ingo Molnar70b97a72006-07-03 00:25:42 -07006258 struct rq *rq_dest, *rq_src;
Peter Zijlstrae2912002009-12-16 18:04:36 +01006259 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006260
Max Krasnyanskye761b772008-07-15 04:43:49 -07006261 if (unlikely(!cpu_active(dest_cpu)))
Kirill Korotaevefc30812006-06-27 02:54:32 -07006262 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006263
6264 rq_src = cpu_rq(src_cpu);
6265 rq_dest = cpu_rq(dest_cpu);
6266
Peter Zijlstra0122ec52011-04-05 17:23:51 +02006267 raw_spin_lock(&p->pi_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006268 double_rq_lock(rq_src, rq_dest);
6269 /* Already moved. */
6270 if (task_cpu(p) != src_cpu)
Linus Torvaldsb1e38732008-07-10 11:25:03 -07006271 goto done;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006272 /* Affinity changed (again). */
Peter Zijlstrafa17b502011-06-16 12:23:22 +02006273 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
Linus Torvaldsb1e38732008-07-10 11:25:03 -07006274 goto fail;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006275
Peter Zijlstrae2912002009-12-16 18:04:36 +01006276 /*
6277 * If we're not on a rq, the next wake-up will ensure we're
6278 * placed properly.
6279 */
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02006280 if (p->on_rq) {
Ingo Molnar2e1cb742007-08-09 11:16:49 +02006281 deactivate_task(rq_src, p, 0);
Peter Zijlstrae2912002009-12-16 18:04:36 +01006282 set_task_cpu(p, dest_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02006283 activate_task(rq_dest, p, 0);
Peter Zijlstra15afe092008-09-20 23:38:02 +02006284 check_preempt_curr(rq_dest, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006285 }
Linus Torvaldsb1e38732008-07-10 11:25:03 -07006286done:
Kirill Korotaevefc30812006-06-27 02:54:32 -07006287 ret = 1;
Linus Torvaldsb1e38732008-07-10 11:25:03 -07006288fail:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006289 double_rq_unlock(rq_src, rq_dest);
Peter Zijlstra0122ec52011-04-05 17:23:51 +02006290 raw_spin_unlock(&p->pi_lock);
Kirill Korotaevefc30812006-06-27 02:54:32 -07006291 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006292}
6293
6294/*
Tejun Heo969c7922010-05-06 18:49:21 +02006295 * migration_cpu_stop - this will be executed by a highprio stopper thread
6296 * and performs thread migration by bumping thread off CPU then
6297 * 'pushing' onto another runqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006298 */
Tejun Heo969c7922010-05-06 18:49:21 +02006299static int migration_cpu_stop(void *data)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006300{
Tejun Heo969c7922010-05-06 18:49:21 +02006301 struct migration_arg *arg = data;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006302
Tejun Heo969c7922010-05-06 18:49:21 +02006303 /*
6304 * The original target cpu might have gone down and we might
6305 * be on another cpu but it doesn't matter.
6306 */
6307 local_irq_disable();
6308 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
6309 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006310 return 0;
6311}
6312
6313#ifdef CONFIG_HOTPLUG_CPU
Linus Torvalds1da177e2005-04-16 15:20:36 -07006314
Ingo Molnar48f24c42006-07-03 00:25:40 -07006315/*
6316 * Ensures that the idle task is using init_mm right before its cpu goes
Linus Torvalds1da177e2005-04-16 15:20:36 -07006317 * offline.
6318 */
6319void idle_task_exit(void)
6320{
6321 struct mm_struct *mm = current->active_mm;
6322
6323 BUG_ON(cpu_online(smp_processor_id()));
6324
6325 if (mm != &init_mm)
6326 switch_mm(mm, &init_mm, current);
6327 mmdrop(mm);
6328}
6329
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006330/*
6331 * While a dead CPU has no uninterruptible tasks queued at this point,
6332 * it might still have a nonzero ->nr_uninterruptible counter, because
6333 * for performance reasons the counter is not stricly tracking tasks to
6334 * their home CPUs. So we just add the counter to another CPU's counter,
6335 * to keep the global sum constant after CPU-down:
6336 */
6337static void migrate_nr_uninterruptible(struct rq *rq_src)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006338{
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006339 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
Linus Torvalds1da177e2005-04-16 15:20:36 -07006340
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006341 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
6342 rq_src->nr_uninterruptible = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006343}
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02006344
6345/*
6346 * remove the tasks which were accounted by rq from calc_load_tasks.
6347 */
6348static void calc_global_load_remove(struct rq *rq)
6349{
6350 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
Thomas Gleixnera468d382009-07-17 14:15:46 +02006351 rq->calc_load_active = 0;
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02006352}
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006353
Paul Turner8cb120d2011-07-21 09:43:38 -07006354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365 /*
6366 * clock_task is not advancing so we just need to make sure
6367 * there's some valid quota amount
6368 */
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006378/*
6379 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6380 * try_to_wake_up()->select_task_rq().
6381 *
6382 * Called with rq->lock held even though we'er in stop_machine() and
6383 * there's no concurrency possible, we hold the required locks anyway
6384 * because of lock validation efforts.
6385 */
6386static void migrate_tasks(unsigned int dead_cpu)
6387{
6388 struct rq *rq = cpu_rq(dead_cpu);
6389 struct task_struct *next, *stop = rq->stop;
6390 int dest_cpu;
6391
6392 /*
6393 * Fudge the rq selection such that the below task selection loop
6394 * doesn't get stuck on the currently eligible stop task.
6395 *
6396 * We're currently inside stop_machine() and the rq is either stuck
6397 * in the stop_machine_cpu_stop() loop, or we're executing this code,
6398 * either way we should never end up calling schedule() until we're
6399 * done here.
6400 */
6401 rq->stop = NULL;
6402
Paul Turner8cb120d2011-07-21 09:43:38 -07006403 /* Ensure any throttled groups are reachable by pick_next_task */
6404 unthrottle_offline_cfs_rqs(rq);
6405
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006406 for ( ; ; ) {
6407 /*
6408 * There's this thread running, bail when that's the only
6409 * remaining thread.
6410 */
6411 if (rq->nr_running == 1)
6412 break;
6413
6414 next = pick_next_task(rq);
6415 BUG_ON(!next);
6416 next->sched_class->put_prev_task(rq, next);
6417
6418 /* Find suitable destination for @next, with force if needed. */
6419 dest_cpu = select_fallback_rq(dead_cpu, next);
6420 raw_spin_unlock(&rq->lock);
6421
6422 __migrate_task(next, dead_cpu, dest_cpu);
6423
6424 raw_spin_lock(&rq->lock);
6425 }
6426
6427 rq->stop = stop;
6428}
6429
Linus Torvalds1da177e2005-04-16 15:20:36 -07006430#endif /* CONFIG_HOTPLUG_CPU */
6431
Nick Piggine692ab52007-07-26 13:40:43 +02006432#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
6433
6434static struct ctl_table sd_ctl_dir[] = {
Alexey Dobriyane0361852007-08-09 11:16:46 +02006435 {
6436 .procname = "sched_domain",
Eric W. Biedermanc57baf12007-08-23 15:18:02 +02006437 .mode = 0555,
Alexey Dobriyane0361852007-08-09 11:16:46 +02006438 },
Eric W. Biederman56992302009-11-05 15:38:40 -08006439 {}
Nick Piggine692ab52007-07-26 13:40:43 +02006440};
6441
6442static struct ctl_table sd_ctl_root[] = {
Alexey Dobriyane0361852007-08-09 11:16:46 +02006443 {
6444 .procname = "kernel",
Eric W. Biedermanc57baf12007-08-23 15:18:02 +02006445 .mode = 0555,
Alexey Dobriyane0361852007-08-09 11:16:46 +02006446 .child = sd_ctl_dir,
6447 },
Eric W. Biederman56992302009-11-05 15:38:40 -08006448 {}
Nick Piggine692ab52007-07-26 13:40:43 +02006449};
6450
6451static struct ctl_table *sd_alloc_ctl_entry(int n)
6452{
6453 struct ctl_table *entry =
Milton Miller5cf9f062007-10-15 17:00:19 +02006454 kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
Nick Piggine692ab52007-07-26 13:40:43 +02006455
Nick Piggine692ab52007-07-26 13:40:43 +02006456 return entry;
6457}
6458
Milton Miller6382bc92007-10-15 17:00:19 +02006459static void sd_free_ctl_entry(struct ctl_table **tablep)
6460{
Milton Millercd7900762007-10-17 16:55:11 +02006461 struct ctl_table *entry;
Milton Miller6382bc92007-10-15 17:00:19 +02006462
Milton Millercd7900762007-10-17 16:55:11 +02006463 /*
6464 * In the intermediate directories, both the child directory and
6465 * procname are dynamically allocated and could fail but the mode
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01006466 * will always be set. In the lowest directory the names are
Milton Millercd7900762007-10-17 16:55:11 +02006467 * static strings and all have proc handlers.
6468 */
6469 for (entry = *tablep; entry->mode; entry++) {
Milton Miller6382bc92007-10-15 17:00:19 +02006470 if (entry->child)
6471 sd_free_ctl_entry(&entry->child);
Milton Millercd7900762007-10-17 16:55:11 +02006472 if (entry->proc_handler == NULL)
6473 kfree(entry->procname);
6474 }
Milton Miller6382bc92007-10-15 17:00:19 +02006475
6476 kfree(*tablep);
6477 *tablep = NULL;
6478}
6479
Nick Piggine692ab52007-07-26 13:40:43 +02006480static void
Alexey Dobriyane0361852007-08-09 11:16:46 +02006481set_table_entry(struct ctl_table *entry,
Nick Piggine692ab52007-07-26 13:40:43 +02006482 const char *procname, void *data, int maxlen,
6483 mode_t mode, proc_handler *proc_handler)
6484{
Nick Piggine692ab52007-07-26 13:40:43 +02006485 entry->procname = procname;
6486 entry->data = data;
6487 entry->maxlen = maxlen;
6488 entry->mode = mode;
6489 entry->proc_handler = proc_handler;
6490}
6491
6492static struct ctl_table *
6493sd_alloc_ctl_domain_table(struct sched_domain *sd)
6494{
Ingo Molnara5d8c342008-10-09 11:35:51 +02006495 struct ctl_table *table = sd_alloc_ctl_entry(13);
Nick Piggine692ab52007-07-26 13:40:43 +02006496
Milton Millerad1cdc12007-10-15 17:00:19 +02006497 if (table == NULL)
6498 return NULL;
6499
Alexey Dobriyane0361852007-08-09 11:16:46 +02006500 set_table_entry(&table[0], "min_interval", &sd->min_interval,
Nick Piggine692ab52007-07-26 13:40:43 +02006501 sizeof(long), 0644, proc_doulongvec_minmax);
Alexey Dobriyane0361852007-08-09 11:16:46 +02006502 set_table_entry(&table[1], "max_interval", &sd->max_interval,
Nick Piggine692ab52007-07-26 13:40:43 +02006503 sizeof(long), 0644, proc_doulongvec_minmax);
Alexey Dobriyane0361852007-08-09 11:16:46 +02006504 set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
Nick Piggine692ab52007-07-26 13:40:43 +02006505 sizeof(int), 0644, proc_dointvec_minmax);
Alexey Dobriyane0361852007-08-09 11:16:46 +02006506 set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
Nick Piggine692ab52007-07-26 13:40:43 +02006507 sizeof(int), 0644, proc_dointvec_minmax);
Alexey Dobriyane0361852007-08-09 11:16:46 +02006508 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
Nick Piggine692ab52007-07-26 13:40:43 +02006509 sizeof(int), 0644, proc_dointvec_minmax);
Alexey Dobriyane0361852007-08-09 11:16:46 +02006510 set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
Nick Piggine692ab52007-07-26 13:40:43 +02006511 sizeof(int), 0644, proc_dointvec_minmax);
Alexey Dobriyane0361852007-08-09 11:16:46 +02006512 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
Nick Piggine692ab52007-07-26 13:40:43 +02006513 sizeof(int), 0644, proc_dointvec_minmax);
Alexey Dobriyane0361852007-08-09 11:16:46 +02006514 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
Nick Piggine692ab52007-07-26 13:40:43 +02006515 sizeof(int), 0644, proc_dointvec_minmax);
Alexey Dobriyane0361852007-08-09 11:16:46 +02006516 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
Nick Piggine692ab52007-07-26 13:40:43 +02006517 sizeof(int), 0644, proc_dointvec_minmax);
Zou Nan haiace8b3d2007-10-15 17:00:14 +02006518 set_table_entry(&table[9], "cache_nice_tries",
Nick Piggine692ab52007-07-26 13:40:43 +02006519 &sd->cache_nice_tries,
6520 sizeof(int), 0644, proc_dointvec_minmax);
Zou Nan haiace8b3d2007-10-15 17:00:14 +02006521 set_table_entry(&table[10], "flags", &sd->flags,
Nick Piggine692ab52007-07-26 13:40:43 +02006522 sizeof(int), 0644, proc_dointvec_minmax);
Ingo Molnara5d8c342008-10-09 11:35:51 +02006523 set_table_entry(&table[11], "name", sd->name,
6524 CORENAME_MAX_SIZE, 0444, proc_dostring);
6525 /* &table[12] is terminator */
Nick Piggine692ab52007-07-26 13:40:43 +02006526
6527 return table;
6528}
6529
Ingo Molnar9a4e7152007-11-28 15:52:56 +01006530static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
Nick Piggine692ab52007-07-26 13:40:43 +02006531{
6532 struct ctl_table *entry, *table;
6533 struct sched_domain *sd;
6534 int domain_num = 0, i;
6535 char buf[32];
6536
6537 for_each_domain(cpu, sd)
6538 domain_num++;
6539 entry = table = sd_alloc_ctl_entry(domain_num + 1);
Milton Millerad1cdc12007-10-15 17:00:19 +02006540 if (table == NULL)
6541 return NULL;
Nick Piggine692ab52007-07-26 13:40:43 +02006542
6543 i = 0;
6544 for_each_domain(cpu, sd) {
6545 snprintf(buf, 32, "domain%d", i);
Nick Piggine692ab52007-07-26 13:40:43 +02006546 entry->procname = kstrdup(buf, GFP_KERNEL);
Eric W. Biedermanc57baf12007-08-23 15:18:02 +02006547 entry->mode = 0555;
Nick Piggine692ab52007-07-26 13:40:43 +02006548 entry->child = sd_alloc_ctl_domain_table(sd);
6549 entry++;
6550 i++;
6551 }
6552 return table;
6553}
6554
6555static struct ctl_table_header *sd_sysctl_header;
Milton Miller6382bc92007-10-15 17:00:19 +02006556static void register_sched_domain_sysctl(void)
Nick Piggine692ab52007-07-26 13:40:43 +02006557{
Peter Zijlstra6ad4c182009-11-25 13:31:39 +01006558 int i, cpu_num = num_possible_cpus();
Nick Piggine692ab52007-07-26 13:40:43 +02006559 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
6560 char buf[32];
6561
Milton Miller73785472007-10-24 18:23:48 +02006562 WARN_ON(sd_ctl_dir[0].child);
6563 sd_ctl_dir[0].child = entry;
6564
Milton Millerad1cdc12007-10-15 17:00:19 +02006565 if (entry == NULL)
6566 return;
6567
Peter Zijlstra6ad4c182009-11-25 13:31:39 +01006568 for_each_possible_cpu(i) {
Nick Piggine692ab52007-07-26 13:40:43 +02006569 snprintf(buf, 32, "cpu%d", i);
Nick Piggine692ab52007-07-26 13:40:43 +02006570 entry->procname = kstrdup(buf, GFP_KERNEL);
Eric W. Biedermanc57baf12007-08-23 15:18:02 +02006571 entry->mode = 0555;
Nick Piggine692ab52007-07-26 13:40:43 +02006572 entry->child = sd_alloc_ctl_cpu_table(i);
Milton Miller97b6ea72007-10-15 17:00:19 +02006573 entry++;
Nick Piggine692ab52007-07-26 13:40:43 +02006574 }
Milton Miller73785472007-10-24 18:23:48 +02006575
6576 WARN_ON(sd_sysctl_header);
Nick Piggine692ab52007-07-26 13:40:43 +02006577 sd_sysctl_header = register_sysctl_table(sd_ctl_root);
6578}
Milton Miller6382bc92007-10-15 17:00:19 +02006579
Milton Miller73785472007-10-24 18:23:48 +02006580/* may be called multiple times per register */
Milton Miller6382bc92007-10-15 17:00:19 +02006581static void unregister_sched_domain_sysctl(void)
6582{
Milton Miller73785472007-10-24 18:23:48 +02006583 if (sd_sysctl_header)
6584 unregister_sysctl_table(sd_sysctl_header);
Milton Miller6382bc92007-10-15 17:00:19 +02006585 sd_sysctl_header = NULL;
Milton Miller73785472007-10-24 18:23:48 +02006586 if (sd_ctl_dir[0].child)
6587 sd_free_ctl_entry(&sd_ctl_dir[0].child);
Milton Miller6382bc92007-10-15 17:00:19 +02006588}
Nick Piggine692ab52007-07-26 13:40:43 +02006589#else
Milton Miller6382bc92007-10-15 17:00:19 +02006590static void register_sched_domain_sysctl(void)
6591{
6592}
6593static void unregister_sched_domain_sysctl(void)
Nick Piggine692ab52007-07-26 13:40:43 +02006594{
6595}
6596#endif
6597
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006598static void set_rq_online(struct rq *rq)
6599{
6600 if (!rq->online) {
6601 const struct sched_class *class;
6602
Rusty Russellc6c49272008-11-25 02:35:05 +10306603 cpumask_set_cpu(rq->cpu, rq->rd->online);
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006604 rq->online = 1;
6605
6606 for_each_class(class) {
6607 if (class->rq_online)
6608 class->rq_online(rq);
6609 }
6610 }
6611}
6612
6613static void set_rq_offline(struct rq *rq)
6614{
6615 if (rq->online) {
6616 const struct sched_class *class;
6617
6618 for_each_class(class) {
6619 if (class->rq_offline)
6620 class->rq_offline(rq);
6621 }
6622
Rusty Russellc6c49272008-11-25 02:35:05 +10306623 cpumask_clear_cpu(rq->cpu, rq->rd->online);
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006624 rq->online = 0;
6625 }
6626}
6627
Linus Torvalds1da177e2005-04-16 15:20:36 -07006628/*
6629 * migration_call - callback that gets triggered when a CPU is added.
6630 * Here we can start up the necessary migration thread for the new CPU.
6631 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07006632static int __cpuinit
6633migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006634{
Ingo Molnar48f24c42006-07-03 00:25:40 -07006635 int cpu = (long)hcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006636 unsigned long flags;
Tejun Heo969c7922010-05-06 18:49:21 +02006637 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006638
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006639 switch (action & ~CPU_TASKS_FROZEN) {
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006640
Linus Torvalds1da177e2005-04-16 15:20:36 -07006641 case CPU_UP_PREPARE:
Thomas Gleixnera468d382009-07-17 14:15:46 +02006642 rq->calc_load_update = calc_load_update;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006643 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006644
Linus Torvalds1da177e2005-04-16 15:20:36 -07006645 case CPU_ONLINE:
Gregory Haskins1f94ef52008-03-10 16:52:41 -04006646 /* Update our root-domain */
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006647 raw_spin_lock_irqsave(&rq->lock, flags);
Gregory Haskins1f94ef52008-03-10 16:52:41 -04006648 if (rq->rd) {
Rusty Russellc6c49272008-11-25 02:35:05 +10306649 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006650
6651 set_rq_online(rq);
Gregory Haskins1f94ef52008-03-10 16:52:41 -04006652 }
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006653 raw_spin_unlock_irqrestore(&rq->lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006654 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006655
Linus Torvalds1da177e2005-04-16 15:20:36 -07006656#ifdef CONFIG_HOTPLUG_CPU
Gregory Haskins08f503b2008-03-10 17:59:11 -04006657 case CPU_DYING:
Peter Zijlstra317f3942011-04-05 17:23:58 +02006658 sched_ttwu_pending();
Gregory Haskins57d885f2008-01-25 21:08:18 +01006659 /* Update our root-domain */
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006660 raw_spin_lock_irqsave(&rq->lock, flags);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006661 if (rq->rd) {
Rusty Russellc6c49272008-11-25 02:35:05 +10306662 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006663 set_rq_offline(rq);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006664 }
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006665 migrate_tasks(cpu);
6666 BUG_ON(rq->nr_running != 1); /* the migration thread */
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006667 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstra48c5cca2010-11-13 19:32:29 +01006668
6669 migrate_nr_uninterruptible(rq);
6670 calc_global_load_remove(rq);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006671 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006672#endif
6673 }
Peter Zijlstra49c022e2011-04-05 10:14:25 +02006674
6675 update_max_interval();
6676
Linus Torvalds1da177e2005-04-16 15:20:36 -07006677 return NOTIFY_OK;
6678}
6679
Paul Mackerrasf38b0822009-06-02 21:05:16 +10006680/*
6681 * Register at high priority so that task migration (migrate_all_tasks)
6682 * happens before everything else. This has to be lower priority than
Ingo Molnarcdd6c482009-09-21 12:02:48 +02006683 * the notifier in the perf_event subsystem, though.
Linus Torvalds1da177e2005-04-16 15:20:36 -07006684 */
Chandra Seetharaman26c21432006-06-27 02:54:10 -07006685static struct notifier_block __cpuinitdata migration_notifier = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006686 .notifier_call = migration_call,
Tejun Heo50a323b2010-06-08 21:40:36 +02006687 .priority = CPU_PRI_MIGRATION,
Linus Torvalds1da177e2005-04-16 15:20:36 -07006688};
6689
Tejun Heo3a101d02010-06-08 21:40:36 +02006690static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
6691 unsigned long action, void *hcpu)
6692{
6693 switch (action & ~CPU_TASKS_FROZEN) {
6694 case CPU_ONLINE:
6695 case CPU_DOWN_FAILED:
6696 set_cpu_active((long)hcpu, true);
6697 return NOTIFY_OK;
6698 default:
6699 return NOTIFY_DONE;
6700 }
6701}
6702
6703static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6704 unsigned long action, void *hcpu)
6705{
6706 switch (action & ~CPU_TASKS_FROZEN) {
6707 case CPU_DOWN_PREPARE:
6708 set_cpu_active((long)hcpu, false);
6709 return NOTIFY_OK;
6710 default:
6711 return NOTIFY_DONE;
6712 }
6713}
6714
Eduard - Gabriel Munteanu7babe8d2008-07-25 19:45:11 -07006715static int __init migration_init(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006716{
6717 void *cpu = (void *)(long)smp_processor_id();
Akinobu Mita07dccf32006-09-29 02:00:22 -07006718 int err;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006719
Tejun Heo3a101d02010-06-08 21:40:36 +02006720 /* Initialize migration for the boot CPU */
Akinobu Mita07dccf32006-09-29 02:00:22 -07006721 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
6722 BUG_ON(err == NOTIFY_BAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006723 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6724 register_cpu_notifier(&migration_notifier);
Eduard - Gabriel Munteanu7babe8d2008-07-25 19:45:11 -07006725
Tejun Heo3a101d02010-06-08 21:40:36 +02006726 /* Register cpu active notifiers */
6727 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6728 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6729
Thomas Gleixnera004cd42009-07-21 09:54:05 +02006730 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006731}
Eduard - Gabriel Munteanu7babe8d2008-07-25 19:45:11 -07006732early_initcall(migration_init);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006733#endif
6734
6735#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07006736
Peter Zijlstra4cb98832011-04-07 14:09:58 +02006737static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6738
Ingo Molnar3e9830d2007-10-15 17:00:13 +02006739#ifdef CONFIG_SCHED_DEBUG
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006740
Mike Travisf6630112009-11-17 18:22:15 -06006741static __read_mostly int sched_domain_debug_enabled;
6742
6743static int __init sched_domain_debug_setup(char *str)
6744{
6745 sched_domain_debug_enabled = 1;
6746
6747 return 0;
6748}
6749early_param("sched_debug", sched_domain_debug_setup);
6750
Mike Travis7c16ec52008-04-04 18:11:11 -07006751static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
Rusty Russell96f874e2008-11-25 02:35:14 +10306752 struct cpumask *groupmask)
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006753{
6754 struct sched_group *group = sd->groups;
Mike Travis434d53b2008-04-04 18:11:04 -07006755 char str[256];
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006756
Rusty Russell968ea6d2008-12-13 21:55:51 +10306757 cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
Rusty Russell96f874e2008-11-25 02:35:14 +10306758 cpumask_clear(groupmask);
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006759
6760 printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
6761
6762 if (!(sd->flags & SD_LOAD_BALANCE)) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006763 printk("does not load-balance\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006764 if (sd->parent)
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006765 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
6766 " has parent");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006767 return -1;
6768 }
6769
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006770 printk(KERN_CONT "span %s level %s\n", str, sd->name);
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006771
Rusty Russell758b2cd2008-11-25 02:35:04 +10306772 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006773 printk(KERN_ERR "ERROR: domain->span does not contain "
6774 "CPU%d\n", cpu);
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006775 }
Rusty Russell758b2cd2008-11-25 02:35:04 +10306776 if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006777 printk(KERN_ERR "ERROR: domain->groups does not contain"
6778 " CPU%d\n", cpu);
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006779 }
6780
6781 printk(KERN_DEBUG "%*s groups:", level + 1, "");
6782 do {
6783 if (!group) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006784 printk("\n");
6785 printk(KERN_ERR "ERROR: group is NULL\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006786 break;
6787 }
6788
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02006789 if (!group->sgp->power) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006790 printk(KERN_CONT "\n");
6791 printk(KERN_ERR "ERROR: domain->cpu_power not "
6792 "set\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006793 break;
6794 }
6795
Rusty Russell758b2cd2008-11-25 02:35:04 +10306796 if (!cpumask_weight(sched_group_cpus(group))) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006797 printk(KERN_CONT "\n");
6798 printk(KERN_ERR "ERROR: empty group\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006799 break;
6800 }
6801
Rusty Russell758b2cd2008-11-25 02:35:04 +10306802 if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006803 printk(KERN_CONT "\n");
6804 printk(KERN_ERR "ERROR: repeated CPUs\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006805 break;
6806 }
6807
Rusty Russell758b2cd2008-11-25 02:35:04 +10306808 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006809
Rusty Russell968ea6d2008-12-13 21:55:51 +10306810 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
Gautham R Shenoy381512c2009-04-14 09:09:36 +05306811
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006812 printk(KERN_CONT " %s", str);
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02006813 if (group->sgp->power != SCHED_POWER_SCALE) {
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006814 printk(KERN_CONT " (cpu_power = %d)",
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02006815 group->sgp->power);
Gautham R Shenoy381512c2009-04-14 09:09:36 +05306816 }
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006817
6818 group = group->next;
6819 } while (group != sd->groups);
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006820 printk(KERN_CONT "\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006821
Rusty Russell758b2cd2008-11-25 02:35:04 +10306822 if (!cpumask_equal(sched_domain_span(sd), groupmask))
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006823 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006824
Rusty Russell758b2cd2008-11-25 02:35:04 +10306825 if (sd->parent &&
6826 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01006827 printk(KERN_ERR "ERROR: parent span is not a superset "
6828 "of domain->span\n");
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006829 return 0;
6830}
6831
Linus Torvalds1da177e2005-04-16 15:20:36 -07006832static void sched_domain_debug(struct sched_domain *sd, int cpu)
6833{
6834 int level = 0;
6835
Mike Travisf6630112009-11-17 18:22:15 -06006836 if (!sched_domain_debug_enabled)
6837 return;
6838
Nick Piggin41c7ce92005-06-25 14:57:24 -07006839 if (!sd) {
6840 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
6841 return;
6842 }
6843
Linus Torvalds1da177e2005-04-16 15:20:36 -07006844 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6845
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006846 for (;;) {
Peter Zijlstra4cb98832011-04-07 14:09:58 +02006847 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
Linus Torvalds1da177e2005-04-16 15:20:36 -07006848 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006849 level++;
6850 sd = sd->parent;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08006851 if (!sd)
Ingo Molnar4dcf6af2007-10-24 18:23:48 +02006852 break;
6853 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006854}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02006855#else /* !CONFIG_SCHED_DEBUG */
Ingo Molnar48f24c42006-07-03 00:25:40 -07006856# define sched_domain_debug(sd, cpu) do { } while (0)
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02006857#endif /* CONFIG_SCHED_DEBUG */
Linus Torvalds1da177e2005-04-16 15:20:36 -07006858
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006859static int sd_degenerate(struct sched_domain *sd)
Suresh Siddha245af2c2005-06-25 14:57:25 -07006860{
Rusty Russell758b2cd2008-11-25 02:35:04 +10306861 if (cpumask_weight(sched_domain_span(sd)) == 1)
Suresh Siddha245af2c2005-06-25 14:57:25 -07006862 return 1;
6863
6864 /* Following flags need at least 2 groups */
6865 if (sd->flags & (SD_LOAD_BALANCE |
6866 SD_BALANCE_NEWIDLE |
6867 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006868 SD_BALANCE_EXEC |
6869 SD_SHARE_CPUPOWER |
6870 SD_SHARE_PKG_RESOURCES)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07006871 if (sd->groups != sd->groups->next)
6872 return 0;
6873 }
6874
6875 /* Following flags don't use groups */
Peter Zijlstrac88d5912009-09-10 13:50:02 +02006876 if (sd->flags & (SD_WAKE_AFFINE))
Suresh Siddha245af2c2005-06-25 14:57:25 -07006877 return 0;
6878
6879 return 1;
6880}
6881
Ingo Molnar48f24c42006-07-03 00:25:40 -07006882static int
6883sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
Suresh Siddha245af2c2005-06-25 14:57:25 -07006884{
6885 unsigned long cflags = sd->flags, pflags = parent->flags;
6886
6887 if (sd_degenerate(parent))
6888 return 1;
6889
Rusty Russell758b2cd2008-11-25 02:35:04 +10306890 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
Suresh Siddha245af2c2005-06-25 14:57:25 -07006891 return 0;
6892
Suresh Siddha245af2c2005-06-25 14:57:25 -07006893 /* Flags needing groups don't count if only 1 group in parent */
6894 if (parent->groups == parent->groups->next) {
6895 pflags &= ~(SD_LOAD_BALANCE |
6896 SD_BALANCE_NEWIDLE |
6897 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006898 SD_BALANCE_EXEC |
6899 SD_SHARE_CPUPOWER |
6900 SD_SHARE_PKG_RESOURCES);
Ken Chen54364992008-12-07 18:47:37 -08006901 if (nr_node_ids == 1)
6902 pflags &= ~SD_SERIALIZE;
Suresh Siddha245af2c2005-06-25 14:57:25 -07006903 }
6904 if (~cflags & pflags)
6905 return 0;
6906
6907 return 1;
6908}
6909
Peter Zijlstradce840a2011-04-07 14:09:50 +02006910static void free_rootdomain(struct rcu_head *rcu)
Rusty Russellc6c49272008-11-25 02:35:05 +10306911{
Peter Zijlstradce840a2011-04-07 14:09:50 +02006912 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
Peter Zijlstra047106a2009-11-16 10:28:09 +01006913
Rusty Russell68e74562008-11-25 02:35:13 +10306914 cpupri_cleanup(&rd->cpupri);
Rusty Russellc6c49272008-11-25 02:35:05 +10306915 free_cpumask_var(rd->rto_mask);
6916 free_cpumask_var(rd->online);
6917 free_cpumask_var(rd->span);
6918 kfree(rd);
6919}
6920
Gregory Haskins57d885f2008-01-25 21:08:18 +01006921static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6922{
Ingo Molnara0490fa2009-02-12 11:35:40 +01006923 struct root_domain *old_rd = NULL;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006924 unsigned long flags;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006925
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006926 raw_spin_lock_irqsave(&rq->lock, flags);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006927
6928 if (rq->rd) {
Ingo Molnara0490fa2009-02-12 11:35:40 +01006929 old_rd = rq->rd;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006930
Rusty Russellc6c49272008-11-25 02:35:05 +10306931 if (cpumask_test_cpu(rq->cpu, old_rd->online))
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006932 set_rq_offline(rq);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006933
Rusty Russellc6c49272008-11-25 02:35:05 +10306934 cpumask_clear_cpu(rq->cpu, old_rd->span);
Gregory Haskinsdc938522008-01-25 21:08:26 +01006935
Ingo Molnara0490fa2009-02-12 11:35:40 +01006936 /*
6937 * If we dont want to free the old_rt yet then
6938 * set old_rd to NULL to skip the freeing later
6939 * in this function:
6940 */
6941 if (!atomic_dec_and_test(&old_rd->refcount))
6942 old_rd = NULL;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006943 }
6944
6945 atomic_inc(&rd->refcount);
6946 rq->rd = rd;
6947
Rusty Russellc6c49272008-11-25 02:35:05 +10306948 cpumask_set_cpu(rq->cpu, rd->span);
Gregory Haskins00aec932009-07-30 10:57:23 -04006949 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
Gregory Haskins1f11eb62008-06-04 15:04:05 -04006950 set_rq_online(rq);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006951
Thomas Gleixner05fa7852009-11-17 14:28:38 +01006952 raw_spin_unlock_irqrestore(&rq->lock, flags);
Ingo Molnara0490fa2009-02-12 11:35:40 +01006953
6954 if (old_rd)
Peter Zijlstradce840a2011-04-07 14:09:50 +02006955 call_rcu_sched(&old_rd->rcu, free_rootdomain);
Gregory Haskins57d885f2008-01-25 21:08:18 +01006956}
6957
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006958static int init_rootdomain(struct root_domain *rd)
Gregory Haskins57d885f2008-01-25 21:08:18 +01006959{
6960 memset(rd, 0, sizeof(*rd));
6961
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006962 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
Li Zefan0c910d22009-01-06 17:39:06 +08006963 goto out;
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006964 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
Rusty Russellc6c49272008-11-25 02:35:05 +10306965 goto free_span;
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006966 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
Rusty Russellc6c49272008-11-25 02:35:05 +10306967 goto free_online;
Gregory Haskins6e0534f2008-05-12 21:21:01 +02006968
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006969 if (cpupri_init(&rd->cpupri) != 0)
Rusty Russell68e74562008-11-25 02:35:13 +10306970 goto free_rto_mask;
Rusty Russellc6c49272008-11-25 02:35:05 +10306971 return 0;
6972
Rusty Russell68e74562008-11-25 02:35:13 +10306973free_rto_mask:
6974 free_cpumask_var(rd->rto_mask);
Rusty Russellc6c49272008-11-25 02:35:05 +10306975free_online:
6976 free_cpumask_var(rd->online);
6977free_span:
6978 free_cpumask_var(rd->span);
Li Zefan0c910d22009-01-06 17:39:06 +08006979out:
Rusty Russellc6c49272008-11-25 02:35:05 +10306980 return -ENOMEM;
Gregory Haskins57d885f2008-01-25 21:08:18 +01006981}
6982
6983static void init_defrootdomain(void)
6984{
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006985 init_rootdomain(&def_root_domain);
Rusty Russellc6c49272008-11-25 02:35:05 +10306986
Gregory Haskins57d885f2008-01-25 21:08:18 +01006987 atomic_set(&def_root_domain.refcount, 1);
6988}
6989
Gregory Haskinsdc938522008-01-25 21:08:26 +01006990static struct root_domain *alloc_rootdomain(void)
Gregory Haskins57d885f2008-01-25 21:08:18 +01006991{
6992 struct root_domain *rd;
6993
6994 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6995 if (!rd)
6996 return NULL;
6997
Pekka Enberg68c38fc2010-07-15 23:18:22 +03006998 if (init_rootdomain(rd) != 0) {
Rusty Russellc6c49272008-11-25 02:35:05 +10306999 kfree(rd);
7000 return NULL;
7001 }
Gregory Haskins57d885f2008-01-25 21:08:18 +01007002
7003 return rd;
7004}
7005
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007006static void free_sched_groups(struct sched_group *sg, int free_sgp)
7007{
7008 struct sched_group *tmp, *first;
7009
7010 if (!sg)
7011 return;
7012
7013 first = sg;
7014 do {
7015 tmp = sg->next;
7016
7017 if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
7018 kfree(sg->sgp);
7019
7020 kfree(sg);
7021 sg = tmp;
7022 } while (sg != first);
7023}
7024
Peter Zijlstradce840a2011-04-07 14:09:50 +02007025static void free_sched_domain(struct rcu_head *rcu)
7026{
7027 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007028
7029 /*
7030 * If its an overlapping domain it has private groups, iterate and
7031 * nuke them all.
7032 */
7033 if (sd->flags & SD_OVERLAP) {
7034 free_sched_groups(sd->groups, 1);
7035 } else if (atomic_dec_and_test(&sd->groups->ref)) {
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007036 kfree(sd->groups->sgp);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007037 kfree(sd->groups);
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007038 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02007039 kfree(sd);
7040}
7041
7042static void destroy_sched_domain(struct sched_domain *sd, int cpu)
7043{
7044 call_rcu(&sd->rcu, free_sched_domain);
7045}
7046
7047static void destroy_sched_domains(struct sched_domain *sd, int cpu)
7048{
7049 for (; sd; sd = sd->parent)
7050 destroy_sched_domain(sd, cpu);
7051}
7052
Linus Torvalds1da177e2005-04-16 15:20:36 -07007053/*
Ingo Molnar0eab9142008-01-25 21:08:19 +01007054 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
Linus Torvalds1da177e2005-04-16 15:20:36 -07007055 * hold the hotplug lock.
7056 */
Ingo Molnar0eab9142008-01-25 21:08:19 +01007057static void
7058cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007059{
Ingo Molnar70b97a72006-07-03 00:25:42 -07007060 struct rq *rq = cpu_rq(cpu);
Suresh Siddha245af2c2005-06-25 14:57:25 -07007061 struct sched_domain *tmp;
7062
7063 /* Remove the sched domains which do not contribute to scheduling. */
Li Zefanf29c9b12008-11-06 09:45:16 +08007064 for (tmp = sd; tmp; ) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07007065 struct sched_domain *parent = tmp->parent;
7066 if (!parent)
7067 break;
Li Zefanf29c9b12008-11-06 09:45:16 +08007068
Siddha, Suresh B1a848872006-10-03 01:14:08 -07007069 if (sd_parent_degenerate(tmp, parent)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07007070 tmp->parent = parent->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07007071 if (parent->parent)
7072 parent->parent->child = tmp;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007073 destroy_sched_domain(parent, cpu);
Li Zefanf29c9b12008-11-06 09:45:16 +08007074 } else
7075 tmp = tmp->parent;
Suresh Siddha245af2c2005-06-25 14:57:25 -07007076 }
7077
Siddha, Suresh B1a848872006-10-03 01:14:08 -07007078 if (sd && sd_degenerate(sd)) {
Peter Zijlstradce840a2011-04-07 14:09:50 +02007079 tmp = sd;
Suresh Siddha245af2c2005-06-25 14:57:25 -07007080 sd = sd->parent;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007081 destroy_sched_domain(tmp, cpu);
Siddha, Suresh B1a848872006-10-03 01:14:08 -07007082 if (sd)
7083 sd->child = NULL;
7084 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007085
Peter Zijlstra4cb98832011-04-07 14:09:58 +02007086 sched_domain_debug(sd, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007087
Gregory Haskins57d885f2008-01-25 21:08:18 +01007088 rq_attach_root(rq, rd);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007089 tmp = rq->sd;
Nick Piggin674311d2005-06-25 14:57:27 -07007090 rcu_assign_pointer(rq->sd, sd);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007091 destroy_sched_domains(tmp, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007092}
7093
7094/* cpus with isolated domains */
Rusty Russelldcc30a32008-11-25 02:35:12 +10307095static cpumask_var_t cpu_isolated_map;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007096
7097/* Setup the mask of cpus configured for isolated domains */
7098static int __init isolated_cpu_setup(char *str)
7099{
Rusty Russellbdddd292009-12-02 14:09:16 +10307100 alloc_bootmem_cpumask_var(&cpu_isolated_map);
Rusty Russell968ea6d2008-12-13 21:55:51 +10307101 cpulist_parse(str, cpu_isolated_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007102 return 1;
7103}
7104
Ingo Molnar8927f492007-10-15 17:00:13 +02007105__setup("isolcpus=", isolated_cpu_setup);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007106
John Hawkes9c1cfda2005-09-06 15:18:14 -07007107#ifdef CONFIG_NUMA
akpm@osdl.org198e2f12006-01-12 01:05:30 -08007108
John Hawkes9c1cfda2005-09-06 15:18:14 -07007109/**
7110 * find_next_best_node - find the next node to include in a sched_domain
7111 * @node: node whose sched_domain we're building
7112 * @used_nodes: nodes already in the sched_domain
7113 *
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01007114 * Find the next node to include in a given scheduling domain. Simply
John Hawkes9c1cfda2005-09-06 15:18:14 -07007115 * finds the closest node not already in the @used_nodes map.
7116 *
7117 * Should use nodemask_t.
7118 */
Mike Travisc5f59f02008-04-04 18:11:10 -07007119static int find_next_best_node(int node, nodemask_t *used_nodes)
John Hawkes9c1cfda2005-09-06 15:18:14 -07007120{
Hillf Danton7142d172011-05-05 20:53:20 +08007121 int i, n, val, min_val, best_node = -1;
John Hawkes9c1cfda2005-09-06 15:18:14 -07007122
7123 min_val = INT_MAX;
7124
Mike Travis076ac2a2008-05-12 21:21:12 +02007125 for (i = 0; i < nr_node_ids; i++) {
John Hawkes9c1cfda2005-09-06 15:18:14 -07007126 /* Start at @node */
Mike Travis076ac2a2008-05-12 21:21:12 +02007127 n = (node + i) % nr_node_ids;
John Hawkes9c1cfda2005-09-06 15:18:14 -07007128
7129 if (!nr_cpus_node(n))
7130 continue;
7131
7132 /* Skip already used nodes */
Mike Travisc5f59f02008-04-04 18:11:10 -07007133 if (node_isset(n, *used_nodes))
John Hawkes9c1cfda2005-09-06 15:18:14 -07007134 continue;
7135
7136 /* Simple min distance search */
7137 val = node_distance(node, n);
7138
7139 if (val < min_val) {
7140 min_val = val;
7141 best_node = n;
7142 }
7143 }
7144
Hillf Danton7142d172011-05-05 20:53:20 +08007145 if (best_node != -1)
7146 node_set(best_node, *used_nodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07007147 return best_node;
7148}
7149
7150/**
7151 * sched_domain_node_span - get a cpumask for a node's sched_domain
7152 * @node: node whose cpumask we're constructing
Randy Dunlap73486722008-04-22 10:07:22 -07007153 * @span: resulting cpumask
John Hawkes9c1cfda2005-09-06 15:18:14 -07007154 *
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01007155 * Given a node, construct a good cpumask for its sched_domain to span. It
John Hawkes9c1cfda2005-09-06 15:18:14 -07007156 * should be one that prevents unnecessary balancing, but also spreads tasks
7157 * out optimally.
7158 */
Rusty Russell96f874e2008-11-25 02:35:14 +10307159static void sched_domain_node_span(int node, struct cpumask *span)
John Hawkes9c1cfda2005-09-06 15:18:14 -07007160{
Mike Travisc5f59f02008-04-04 18:11:10 -07007161 nodemask_t used_nodes;
Ingo Molnar48f24c42006-07-03 00:25:40 -07007162 int i;
John Hawkes9c1cfda2005-09-06 15:18:14 -07007163
Mike Travis6ca09df2008-12-31 18:08:45 -08007164 cpumask_clear(span);
Mike Travisc5f59f02008-04-04 18:11:10 -07007165 nodes_clear(used_nodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07007166
Mike Travis6ca09df2008-12-31 18:08:45 -08007167 cpumask_or(span, span, cpumask_of_node(node));
Mike Travisc5f59f02008-04-04 18:11:10 -07007168 node_set(node, used_nodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07007169
7170 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
Mike Travisc5f59f02008-04-04 18:11:10 -07007171 int next_node = find_next_best_node(node, &used_nodes);
Hillf Danton7142d172011-05-05 20:53:20 +08007172 if (next_node < 0)
7173 break;
Mike Travis6ca09df2008-12-31 18:08:45 -08007174 cpumask_or(span, span, cpumask_of_node(next_node));
John Hawkes9c1cfda2005-09-06 15:18:14 -07007175 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07007176}
Peter Zijlstrad3081f52011-04-07 14:09:59 +02007177
7178static const struct cpumask *cpu_node_mask(int cpu)
7179{
7180 lockdep_assert_held(&sched_domains_mutex);
7181
7182 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
7183
7184 return sched_domains_tmpmask;
7185}
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007186
7187static const struct cpumask *cpu_allnodes_mask(int cpu)
7188{
7189 return cpu_possible_mask;
7190}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02007191#endif /* CONFIG_NUMA */
John Hawkes9c1cfda2005-09-06 15:18:14 -07007192
Peter Zijlstrad3081f52011-04-07 14:09:59 +02007193static const struct cpumask *cpu_cpu_mask(int cpu)
7194{
7195 return cpumask_of_node(cpu_to_node(cpu));
7196}
7197
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007198int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07007199
Peter Zijlstradce840a2011-04-07 14:09:50 +02007200struct sd_data {
7201 struct sched_domain **__percpu sd;
7202 struct sched_group **__percpu sg;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007203 struct sched_group_power **__percpu sgp;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007204};
7205
Andreas Herrmann49a02c52009-08-18 12:51:52 +02007206struct s_data {
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007207 struct sched_domain ** __percpu sd;
Andreas Herrmann49a02c52009-08-18 12:51:52 +02007208 struct root_domain *rd;
7209};
7210
Andreas Herrmann2109b992009-08-18 12:53:00 +02007211enum s_alloc {
Andreas Herrmann2109b992009-08-18 12:53:00 +02007212 sa_rootdomain,
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007213 sa_sd,
Peter Zijlstradce840a2011-04-07 14:09:50 +02007214 sa_sd_storage,
Andreas Herrmann2109b992009-08-18 12:53:00 +02007215 sa_none,
7216};
7217
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007218struct sched_domain_topology_level;
7219
7220typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
Peter Zijlstraeb7a74e62011-04-07 14:10:00 +02007221typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
7222
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007223#define SDTL_OVERLAP 0x01
7224
Peter Zijlstraeb7a74e62011-04-07 14:10:00 +02007225struct sched_domain_topology_level {
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007226 sched_domain_init_f init;
7227 sched_domain_mask_f mask;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007228 int flags;
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007229 struct sd_data data;
Peter Zijlstraeb7a74e62011-04-07 14:10:00 +02007230};
7231
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007232static int
7233build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7234{
7235 struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
7236 const struct cpumask *span = sched_domain_span(sd);
7237 struct cpumask *covered = sched_domains_tmpmask;
7238 struct sd_data *sdd = sd->private;
7239 struct sched_domain *child;
7240 int i;
7241
7242 cpumask_clear(covered);
7243
7244 for_each_cpu(i, span) {
7245 struct cpumask *sg_span;
7246
7247 if (cpumask_test_cpu(i, covered))
7248 continue;
7249
7250 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7251 GFP_KERNEL, cpu_to_node(i));
7252
7253 if (!sg)
7254 goto fail;
7255
7256 sg_span = sched_group_cpus(sg);
7257
7258 child = *per_cpu_ptr(sdd->sd, i);
7259 if (child->child) {
7260 child = child->child;
7261 cpumask_copy(sg_span, sched_domain_span(child));
7262 } else
7263 cpumask_set_cpu(i, sg_span);
7264
7265 cpumask_or(covered, covered, sg_span);
7266
7267 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
7268 atomic_inc(&sg->sgp->ref);
7269
7270 if (cpumask_test_cpu(cpu, sg_span))
7271 groups = sg;
7272
7273 if (!first)
7274 first = sg;
7275 if (last)
7276 last->next = sg;
7277 last = sg;
7278 last->next = first;
7279 }
7280 sd->groups = groups;
7281
7282 return 0;
7283
7284fail:
7285 free_sched_groups(first, 0);
7286
7287 return -ENOMEM;
7288}
7289
Peter Zijlstradce840a2011-04-07 14:09:50 +02007290static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007291{
Peter Zijlstradce840a2011-04-07 14:09:50 +02007292 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
7293 struct sched_domain *child = sd->child;
7294
7295 if (child)
7296 cpu = cpumask_first(sched_domain_span(child));
7297
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007298 if (sg) {
Peter Zijlstradce840a2011-04-07 14:09:50 +02007299 *sg = *per_cpu_ptr(sdd->sg, cpu);
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007300 (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007301 atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007302 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02007303
Linus Torvalds1da177e2005-04-16 15:20:36 -07007304 return cpu;
7305}
Linus Torvalds1da177e2005-04-16 15:20:36 -07007306
Ingo Molnar48f24c42006-07-03 00:25:40 -07007307/*
Peter Zijlstradce840a2011-04-07 14:09:50 +02007308 * build_sched_groups will build a circular linked list of the groups
7309 * covered by the given span, and will set each group's ->cpumask correctly,
7310 * and ->cpu_power to 0.
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007311 *
7312 * Assumes the sched_domain tree is fully constructed
Ingo Molnar48f24c42006-07-03 00:25:40 -07007313 */
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007314static int
7315build_sched_groups(struct sched_domain *sd, int cpu)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08007316{
Peter Zijlstradce840a2011-04-07 14:09:50 +02007317 struct sched_group *first = NULL, *last = NULL;
7318 struct sd_data *sdd = sd->private;
7319 const struct cpumask *span = sched_domain_span(sd);
Peter Zijlstraf96225f2011-04-07 14:09:57 +02007320 struct cpumask *covered;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007321 int i;
7322
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007323 get_group(cpu, sdd, &sd->groups);
7324 atomic_inc(&sd->groups->ref);
7325
7326 if (cpu != cpumask_first(sched_domain_span(sd)))
7327 return 0;
7328
Peter Zijlstraf96225f2011-04-07 14:09:57 +02007329 lockdep_assert_held(&sched_domains_mutex);
7330 covered = sched_domains_tmpmask;
7331
Peter Zijlstradce840a2011-04-07 14:09:50 +02007332 cpumask_clear(covered);
7333
7334 for_each_cpu(i, span) {
7335 struct sched_group *sg;
7336 int group = get_group(i, sdd, &sg);
7337 int j;
7338
7339 if (cpumask_test_cpu(i, covered))
7340 continue;
7341
7342 cpumask_clear(sched_group_cpus(sg));
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007343 sg->sgp->power = 0;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007344
7345 for_each_cpu(j, span) {
7346 if (get_group(j, sdd, NULL) != group)
7347 continue;
7348
7349 cpumask_set_cpu(j, covered);
7350 cpumask_set_cpu(j, sched_group_cpus(sg));
7351 }
7352
7353 if (!first)
7354 first = sg;
7355 if (last)
7356 last->next = sg;
7357 last = sg;
7358 }
7359 last->next = first;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007360
7361 return 0;
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08007362}
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07007363
Linus Torvalds1da177e2005-04-16 15:20:36 -07007364/*
Siddha, Suresh B89c47102006-10-03 01:14:09 -07007365 * Initialize sched groups cpu_power.
7366 *
7367 * cpu_power indicates the capacity of sched group, which is used while
7368 * distributing the load between different sched groups in a sched domain.
7369 * Typically cpu_power for all the groups in a sched domain will be same unless
7370 * there are asymmetries in the topology. If there are asymmetries, group
7371 * having more cpu_power will pickup more load compared to the group having
7372 * less cpu_power.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07007373 */
7374static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7375{
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007376 struct sched_group *sg = sd->groups;
Siddha, Suresh B89c47102006-10-03 01:14:09 -07007377
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007378 WARN_ON(!sd || !sg);
7379
7380 do {
7381 sg->group_weight = cpumask_weight(sched_group_cpus(sg));
7382 sg = sg->next;
7383 } while (sg != sd->groups);
7384
7385 if (cpu != group_first_cpu(sg))
Siddha, Suresh B89c47102006-10-03 01:14:09 -07007386 return;
7387
Peter Zijlstrad274cb32011-04-07 14:09:43 +02007388 update_group_power(sd, cpu);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07007389}
7390
7391/*
Mike Travis7c16ec52008-04-04 18:11:11 -07007392 * Initializers for schedule domains
7393 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7394 */
7395
Ingo Molnara5d8c342008-10-09 11:35:51 +02007396#ifdef CONFIG_SCHED_DEBUG
7397# define SD_INIT_NAME(sd, type) sd->name = #type
7398#else
7399# define SD_INIT_NAME(sd, type) do { } while (0)
7400#endif
7401
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007402#define SD_INIT_FUNC(type) \
7403static noinline struct sched_domain * \
7404sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7405{ \
7406 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7407 *sd = SD_##type##_INIT; \
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007408 SD_INIT_NAME(sd, type); \
7409 sd->private = &tl->data; \
7410 return sd; \
Mike Travis7c16ec52008-04-04 18:11:11 -07007411}
7412
7413SD_INIT_FUNC(CPU)
7414#ifdef CONFIG_NUMA
7415 SD_INIT_FUNC(ALLNODES)
7416 SD_INIT_FUNC(NODE)
7417#endif
7418#ifdef CONFIG_SCHED_SMT
7419 SD_INIT_FUNC(SIBLING)
7420#endif
7421#ifdef CONFIG_SCHED_MC
7422 SD_INIT_FUNC(MC)
7423#endif
Heiko Carstens01a08542010-08-31 10:28:16 +02007424#ifdef CONFIG_SCHED_BOOK
7425 SD_INIT_FUNC(BOOK)
7426#endif
Mike Travis7c16ec52008-04-04 18:11:11 -07007427
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007428static int default_relax_domain_level = -1;
Peter Zijlstra60495e72011-04-07 14:10:04 +02007429int sched_domain_level_max;
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007430
7431static int __init setup_relax_domain_level(char *str)
7432{
Li Zefan30e0e172008-05-13 10:27:17 +08007433 unsigned long val;
7434
7435 val = simple_strtoul(str, NULL, 0);
Peter Zijlstra60495e72011-04-07 14:10:04 +02007436 if (val < sched_domain_level_max)
Li Zefan30e0e172008-05-13 10:27:17 +08007437 default_relax_domain_level = val;
7438
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007439 return 1;
7440}
7441__setup("relax_domain_level=", setup_relax_domain_level);
7442
7443static void set_domain_attribute(struct sched_domain *sd,
7444 struct sched_domain_attr *attr)
7445{
7446 int request;
7447
7448 if (!attr || attr->relax_domain_level < 0) {
7449 if (default_relax_domain_level < 0)
7450 return;
7451 else
7452 request = default_relax_domain_level;
7453 } else
7454 request = attr->relax_domain_level;
7455 if (request < sd->level) {
7456 /* turn off idle balance on this domain */
Peter Zijlstrac88d5912009-09-10 13:50:02 +02007457 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007458 } else {
7459 /* turn on idle balance on this domain */
Peter Zijlstrac88d5912009-09-10 13:50:02 +02007460 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007461 }
7462}
7463
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007464static void __sdt_free(const struct cpumask *cpu_map);
7465static int __sdt_alloc(const struct cpumask *cpu_map);
7466
Andreas Herrmann2109b992009-08-18 12:53:00 +02007467static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7468 const struct cpumask *cpu_map)
7469{
7470 switch (what) {
Andreas Herrmann2109b992009-08-18 12:53:00 +02007471 case sa_rootdomain:
Peter Zijlstra822ff792011-04-07 14:09:51 +02007472 if (!atomic_read(&d->rd->refcount))
7473 free_rootdomain(&d->rd->rcu); /* fall through */
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007474 case sa_sd:
7475 free_percpu(d->sd); /* fall through */
Peter Zijlstradce840a2011-04-07 14:09:50 +02007476 case sa_sd_storage:
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007477 __sdt_free(cpu_map); /* fall through */
Andreas Herrmann2109b992009-08-18 12:53:00 +02007478 case sa_none:
7479 break;
7480 }
7481}
7482
7483static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7484 const struct cpumask *cpu_map)
7485{
Peter Zijlstradce840a2011-04-07 14:09:50 +02007486 memset(d, 0, sizeof(*d));
7487
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007488 if (__sdt_alloc(cpu_map))
7489 return sa_sd_storage;
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007490 d->sd = alloc_percpu(struct sched_domain *);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007491 if (!d->sd)
7492 return sa_sd_storage;
Andreas Herrmann2109b992009-08-18 12:53:00 +02007493 d->rd = alloc_rootdomain();
Peter Zijlstradce840a2011-04-07 14:09:50 +02007494 if (!d->rd)
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007495 return sa_sd;
Andreas Herrmann2109b992009-08-18 12:53:00 +02007496 return sa_rootdomain;
7497}
7498
Peter Zijlstradce840a2011-04-07 14:09:50 +02007499/*
7500 * NULL the sd_data elements we've used to build the sched_domain and
7501 * sched_group structure so that the subsequent __free_domain_allocs()
7502 * will not free the data we're using.
7503 */
7504static void claim_allocations(int cpu, struct sched_domain *sd)
7505{
7506 struct sd_data *sdd = sd->private;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007507
7508 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7509 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7510
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007511 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
Peter Zijlstradce840a2011-04-07 14:09:50 +02007512 *per_cpu_ptr(sdd->sg, cpu) = NULL;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007513
7514 if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007515 *per_cpu_ptr(sdd->sgp, cpu) = NULL;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007516}
7517
Andreas Herrmannd8173532009-08-18 12:57:03 +02007518#ifdef CONFIG_SCHED_SMT
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007519static const struct cpumask *cpu_smt_mask(int cpu)
7520{
7521 return topology_thread_cpumask(cpu);
Andreas Herrmannd8173532009-08-18 12:57:03 +02007522}
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007523#endif
Andreas Herrmannd8173532009-08-18 12:57:03 +02007524
Peter Zijlstrad069b912011-04-07 14:10:02 +02007525/*
7526 * Topology list, bottom-up.
7527 */
Peter Zijlstraeb7a74e62011-04-07 14:10:00 +02007528static struct sched_domain_topology_level default_topology[] = {
Peter Zijlstrad069b912011-04-07 14:10:02 +02007529#ifdef CONFIG_SCHED_SMT
7530 { sd_init_SIBLING, cpu_smt_mask, },
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007531#endif
7532#ifdef CONFIG_SCHED_MC
7533 { sd_init_MC, cpu_coregroup_mask, },
7534#endif
Peter Zijlstrad069b912011-04-07 14:10:02 +02007535#ifdef CONFIG_SCHED_BOOK
7536 { sd_init_BOOK, cpu_book_mask, },
7537#endif
7538 { sd_init_CPU, cpu_cpu_mask, },
7539#ifdef CONFIG_NUMA
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007540 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
Peter Zijlstrad069b912011-04-07 14:10:02 +02007541 { sd_init_ALLNODES, cpu_allnodes_mask, },
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007542#endif
Peter Zijlstraeb7a74e62011-04-07 14:10:00 +02007543 { NULL, },
7544};
7545
7546static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7547
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007548static int __sdt_alloc(const struct cpumask *cpu_map)
7549{
7550 struct sched_domain_topology_level *tl;
7551 int j;
7552
7553 for (tl = sched_domain_topology; tl->init; tl++) {
7554 struct sd_data *sdd = &tl->data;
7555
7556 sdd->sd = alloc_percpu(struct sched_domain *);
7557 if (!sdd->sd)
7558 return -ENOMEM;
7559
7560 sdd->sg = alloc_percpu(struct sched_group *);
7561 if (!sdd->sg)
7562 return -ENOMEM;
7563
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007564 sdd->sgp = alloc_percpu(struct sched_group_power *);
7565 if (!sdd->sgp)
7566 return -ENOMEM;
7567
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007568 for_each_cpu(j, cpu_map) {
7569 struct sched_domain *sd;
7570 struct sched_group *sg;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007571 struct sched_group_power *sgp;
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007572
7573 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7574 GFP_KERNEL, cpu_to_node(j));
7575 if (!sd)
7576 return -ENOMEM;
7577
7578 *per_cpu_ptr(sdd->sd, j) = sd;
7579
7580 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7581 GFP_KERNEL, cpu_to_node(j));
7582 if (!sg)
7583 return -ENOMEM;
7584
7585 *per_cpu_ptr(sdd->sg, j) = sg;
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007586
7587 sgp = kzalloc_node(sizeof(struct sched_group_power),
7588 GFP_KERNEL, cpu_to_node(j));
7589 if (!sgp)
7590 return -ENOMEM;
7591
7592 *per_cpu_ptr(sdd->sgp, j) = sgp;
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007593 }
7594 }
7595
7596 return 0;
7597}
7598
7599static void __sdt_free(const struct cpumask *cpu_map)
7600{
7601 struct sched_domain_topology_level *tl;
7602 int j;
7603
7604 for (tl = sched_domain_topology; tl->init; tl++) {
7605 struct sd_data *sdd = &tl->data;
7606
7607 for_each_cpu(j, cpu_map) {
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007608 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7609 if (sd && (sd->flags & SD_OVERLAP))
7610 free_sched_groups(sd->groups, 0);
WANG Congfeff8fa2011-08-18 20:36:57 +08007611 kfree(*per_cpu_ptr(sdd->sd, j));
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007612 kfree(*per_cpu_ptr(sdd->sg, j));
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007613 kfree(*per_cpu_ptr(sdd->sgp, j));
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007614 }
7615 free_percpu(sdd->sd);
7616 free_percpu(sdd->sg);
Peter Zijlstra9c3f75c2011-07-14 13:00:06 +02007617 free_percpu(sdd->sgp);
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007618 }
7619}
7620
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007621struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7622 struct s_data *d, const struct cpumask *cpu_map,
Peter Zijlstrad069b912011-04-07 14:10:02 +02007623 struct sched_domain_attr *attr, struct sched_domain *child,
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007624 int cpu)
7625{
Peter Zijlstra54ab4ff2011-04-07 14:10:03 +02007626 struct sched_domain *sd = tl->init(tl, cpu);
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007627 if (!sd)
Peter Zijlstrad069b912011-04-07 14:10:02 +02007628 return child;
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007629
7630 set_domain_attribute(sd, attr);
7631 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
Peter Zijlstra60495e72011-04-07 14:10:04 +02007632 if (child) {
7633 sd->level = child->level + 1;
7634 sched_domain_level_max = max(sched_domain_level_max, sd->level);
Peter Zijlstrad069b912011-04-07 14:10:02 +02007635 child->parent = sd;
Peter Zijlstra60495e72011-04-07 14:10:04 +02007636 }
Peter Zijlstrad069b912011-04-07 14:10:02 +02007637 sd->child = child;
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007638
7639 return sd;
7640}
7641
Mike Travis7c16ec52008-04-04 18:11:11 -07007642/*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007643 * Build sched domains for a given set of cpus and attach the sched domains
7644 * to the individual cpus
Linus Torvalds1da177e2005-04-16 15:20:36 -07007645 */
Peter Zijlstradce840a2011-04-07 14:09:50 +02007646static int build_sched_domains(const struct cpumask *cpu_map,
7647 struct sched_domain_attr *attr)
Linus Torvalds1da177e2005-04-16 15:20:36 -07007648{
Andreas Herrmann2109b992009-08-18 12:53:00 +02007649 enum s_alloc alloc_state = sa_none;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007650 struct sched_domain *sd;
Andreas Herrmann49a02c52009-08-18 12:51:52 +02007651 struct s_data d;
Peter Zijlstra822ff792011-04-07 14:09:51 +02007652 int i, ret = -ENOMEM;
Rusty Russell3404c8d2008-11-25 02:35:03 +10307653
Andreas Herrmann2109b992009-08-18 12:53:00 +02007654 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7655 if (alloc_state != sa_rootdomain)
7656 goto error;
Mike Travis7c16ec52008-04-04 18:11:11 -07007657
Peter Zijlstradce840a2011-04-07 14:09:50 +02007658 /* Set up domains for cpus specified by the cpu_map. */
Rusty Russellabcd0832008-11-25 02:35:02 +10307659 for_each_cpu(i, cpu_map) {
Peter Zijlstraeb7a74e62011-04-07 14:10:00 +02007660 struct sched_domain_topology_level *tl;
7661
Peter Zijlstra3bd65a82011-04-07 14:09:54 +02007662 sd = NULL;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007663 for (tl = sched_domain_topology; tl->init; tl++) {
Peter Zijlstra2c402dc2011-04-07 14:10:01 +02007664 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007665 if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
7666 sd->flags |= SD_OVERLAP;
Peter Zijlstrad1102352011-07-20 18:42:57 +02007667 if (cpumask_equal(cpu_map, sched_domain_span(sd)))
7668 break;
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007669 }
Peter Zijlstrad274cb32011-04-07 14:09:43 +02007670
Peter Zijlstrad069b912011-04-07 14:10:02 +02007671 while (sd->child)
7672 sd = sd->child;
7673
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007674 *per_cpu_ptr(d.sd, i) = sd;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007675 }
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007676
Peter Zijlstradce840a2011-04-07 14:09:50 +02007677 /* Build the groups for the domains */
7678 for_each_cpu(i, cpu_map) {
7679 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7680 sd->span_weight = cpumask_weight(sched_domain_span(sd));
Peter Zijlstrae3589f62011-07-15 10:35:52 +02007681 if (sd->flags & SD_OVERLAP) {
7682 if (build_overlap_sched_groups(sd, i))
7683 goto error;
7684 } else {
7685 if (build_sched_groups(sd, i))
7686 goto error;
7687 }
Peter Zijlstra1cf519022011-04-07 14:09:47 +02007688 }
Peter Zijlstraa06dadb2011-04-07 14:09:44 +02007689 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07007690
Linus Torvalds1da177e2005-04-16 15:20:36 -07007691 /* Calculate CPU power for physical packages and nodes */
Peter Zijlstraa9c9a9b2011-04-07 14:09:49 +02007692 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7693 if (!cpumask_test_cpu(i, cpu_map))
7694 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007695
Peter Zijlstradce840a2011-04-07 14:09:50 +02007696 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7697 claim_allocations(i, sd);
Peter Zijlstracd4ea6a2011-04-07 14:09:45 +02007698 init_sched_groups_power(i, sd);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007699 }
Siddha, Suresh Bf712c0c2006-07-30 03:02:59 -07007700 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07007701
Linus Torvalds1da177e2005-04-16 15:20:36 -07007702 /* Attach the domains */
Peter Zijlstradce840a2011-04-07 14:09:50 +02007703 rcu_read_lock();
Rusty Russellabcd0832008-11-25 02:35:02 +10307704 for_each_cpu(i, cpu_map) {
Peter Zijlstra21d42cc2011-04-07 14:09:48 +02007705 sd = *per_cpu_ptr(d.sd, i);
Andreas Herrmann49a02c52009-08-18 12:51:52 +02007706 cpu_attach_domain(sd, d.rd, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07007707 }
Peter Zijlstradce840a2011-04-07 14:09:50 +02007708 rcu_read_unlock();
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07007709
Peter Zijlstra822ff792011-04-07 14:09:51 +02007710 ret = 0;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07007711error:
Andreas Herrmann2109b992009-08-18 12:53:00 +02007712 __free_domain_allocs(&d, alloc_state, cpu_map);
Peter Zijlstra822ff792011-04-07 14:09:51 +02007713 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07007714}
Paul Jackson029190c2007-10-18 23:40:20 -07007715
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307716static cpumask_var_t *doms_cur; /* current sched domains */
Paul Jackson029190c2007-10-18 23:40:20 -07007717static int ndoms_cur; /* number of sched domains in 'doms_cur' */
Ingo Molnar4285f5942008-05-16 17:47:14 +02007718static struct sched_domain_attr *dattr_cur;
7719 /* attribues of custom domains in 'doms_cur' */
Paul Jackson029190c2007-10-18 23:40:20 -07007720
7721/*
7722 * Special case: If a kmalloc of a doms_cur partition (array of
Rusty Russell42128232008-11-25 02:35:12 +10307723 * cpumask) fails, then fallback to a single sched domain,
7724 * as determined by the single cpumask fallback_doms.
Paul Jackson029190c2007-10-18 23:40:20 -07007725 */
Rusty Russell42128232008-11-25 02:35:12 +10307726static cpumask_var_t fallback_doms;
Paul Jackson029190c2007-10-18 23:40:20 -07007727
Heiko Carstensee79d1b2008-12-09 18:49:50 +01007728/*
7729 * arch_update_cpu_topology lets virtualized architectures update the
7730 * cpu core maps. It is supposed to return 1 if the topology changed
7731 * or 0 if it stayed the same.
7732 */
7733int __attribute__((weak)) arch_update_cpu_topology(void)
Heiko Carstens22e52b02008-03-12 18:31:59 +01007734{
Heiko Carstensee79d1b2008-12-09 18:49:50 +01007735 return 0;
Heiko Carstens22e52b02008-03-12 18:31:59 +01007736}
7737
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307738cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
7739{
7740 int i;
7741 cpumask_var_t *doms;
7742
7743 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
7744 if (!doms)
7745 return NULL;
7746 for (i = 0; i < ndoms; i++) {
7747 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
7748 free_sched_domains(doms, i);
7749 return NULL;
7750 }
7751 }
7752 return doms;
7753}
7754
7755void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7756{
7757 unsigned int i;
7758 for (i = 0; i < ndoms; i++)
7759 free_cpumask_var(doms[i]);
7760 kfree(doms);
7761}
7762
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007763/*
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01007764 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
Paul Jackson029190c2007-10-18 23:40:20 -07007765 * For now this just excludes isolated cpus, but could be used to
7766 * exclude other special cases in the future.
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007767 */
Peter Zijlstrac4a88492011-04-07 14:09:42 +02007768static int init_sched_domains(const struct cpumask *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007769{
Milton Miller73785472007-10-24 18:23:48 +02007770 int err;
7771
Heiko Carstens22e52b02008-03-12 18:31:59 +01007772 arch_update_cpu_topology();
Paul Jackson029190c2007-10-18 23:40:20 -07007773 ndoms_cur = 1;
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307774 doms_cur = alloc_sched_domains(ndoms_cur);
Paul Jackson029190c2007-10-18 23:40:20 -07007775 if (!doms_cur)
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307776 doms_cur = &fallback_doms;
7777 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007778 dattr_cur = NULL;
Peter Zijlstradce840a2011-04-07 14:09:50 +02007779 err = build_sched_domains(doms_cur[0], NULL);
Milton Miller6382bc92007-10-15 17:00:19 +02007780 register_sched_domain_sysctl();
Milton Miller73785472007-10-24 18:23:48 +02007781
7782 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007783}
7784
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007785/*
7786 * Detach sched domains from a group of cpus specified in cpu_map
7787 * These cpus will now be attached to the NULL domain
7788 */
Rusty Russell96f874e2008-11-25 02:35:14 +10307789static void detach_destroy_domains(const struct cpumask *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007790{
7791 int i;
7792
Peter Zijlstradce840a2011-04-07 14:09:50 +02007793 rcu_read_lock();
Rusty Russellabcd0832008-11-25 02:35:02 +10307794 for_each_cpu(i, cpu_map)
Gregory Haskins57d885f2008-01-25 21:08:18 +01007795 cpu_attach_domain(NULL, &def_root_domain, i);
Peter Zijlstradce840a2011-04-07 14:09:50 +02007796 rcu_read_unlock();
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07007797}
7798
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007799/* handle null as "default" */
7800static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7801 struct sched_domain_attr *new, int idx_new)
7802{
7803 struct sched_domain_attr tmp;
7804
7805 /* fast path */
7806 if (!new && !cur)
7807 return 1;
7808
7809 tmp = SD_ATTR_INIT;
7810 return !memcmp(cur ? (cur + idx_cur) : &tmp,
7811 new ? (new + idx_new) : &tmp,
7812 sizeof(struct sched_domain_attr));
7813}
7814
Paul Jackson029190c2007-10-18 23:40:20 -07007815/*
7816 * Partition sched domains as specified by the 'ndoms_new'
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01007817 * cpumasks in the array doms_new[] of cpumasks. This compares
Paul Jackson029190c2007-10-18 23:40:20 -07007818 * doms_new[] to the current sched domain partitioning, doms_cur[].
7819 * It destroys each deleted domain and builds each new domain.
7820 *
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307821 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01007822 * The masks don't intersect (don't overlap.) We should setup one
7823 * sched domain for each mask. CPUs not in any of the cpumasks will
7824 * not be load balanced. If the same cpumask appears both in the
Paul Jackson029190c2007-10-18 23:40:20 -07007825 * current 'doms_cur' domains and in the new 'doms_new', we can leave
7826 * it as it is.
7827 *
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307828 * The passed in 'doms_new' should be allocated using
7829 * alloc_sched_domains. This routine takes ownership of it and will
7830 * free_sched_domains it when done with it. If the caller failed the
7831 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
7832 * and partition_sched_domains() will fallback to the single partition
7833 * 'fallback_doms', it also forces the domains to be rebuilt.
Paul Jackson029190c2007-10-18 23:40:20 -07007834 *
Rusty Russell96f874e2008-11-25 02:35:14 +10307835 * If doms_new == NULL it will be replaced with cpu_online_mask.
Li Zefan700018e2008-11-18 14:02:03 +08007836 * ndoms_new == 0 is a special case for destroying existing domains,
7837 * and it will not create the default domain.
Max Krasnyanskydfb512e2008-08-29 13:11:41 -07007838 *
Paul Jackson029190c2007-10-18 23:40:20 -07007839 * Call with hotplug lock held
7840 */
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307841void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007842 struct sched_domain_attr *dattr_new)
Paul Jackson029190c2007-10-18 23:40:20 -07007843{
Max Krasnyanskydfb512e2008-08-29 13:11:41 -07007844 int i, j, n;
Heiko Carstensd65bd5e2008-12-09 18:49:51 +01007845 int new_topology;
Paul Jackson029190c2007-10-18 23:40:20 -07007846
Heiko Carstens712555e2008-04-28 11:33:07 +02007847 mutex_lock(&sched_domains_mutex);
Srivatsa Vaddagiria1835612008-01-25 21:08:00 +01007848
Milton Miller73785472007-10-24 18:23:48 +02007849 /* always unregister in case we don't destroy any domains */
7850 unregister_sched_domain_sysctl();
7851
Heiko Carstensd65bd5e2008-12-09 18:49:51 +01007852 /* Let architecture update cpu core mappings. */
7853 new_topology = arch_update_cpu_topology();
7854
Max Krasnyanskydfb512e2008-08-29 13:11:41 -07007855 n = doms_new ? ndoms_new : 0;
Paul Jackson029190c2007-10-18 23:40:20 -07007856
7857 /* Destroy deleted domains */
7858 for (i = 0; i < ndoms_cur; i++) {
Heiko Carstensd65bd5e2008-12-09 18:49:51 +01007859 for (j = 0; j < n && !new_topology; j++) {
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307860 if (cpumask_equal(doms_cur[i], doms_new[j])
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007861 && dattrs_equal(dattr_cur, i, dattr_new, j))
Paul Jackson029190c2007-10-18 23:40:20 -07007862 goto match1;
7863 }
7864 /* no match - a current sched domain not in new doms_new[] */
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307865 detach_destroy_domains(doms_cur[i]);
Paul Jackson029190c2007-10-18 23:40:20 -07007866match1:
7867 ;
7868 }
7869
Max Krasnyanskye761b772008-07-15 04:43:49 -07007870 if (doms_new == NULL) {
7871 ndoms_cur = 0;
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307872 doms_new = &fallback_doms;
Peter Zijlstra6ad4c182009-11-25 13:31:39 +01007873 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
Li Zefanfaa2f982008-11-04 16:20:23 +08007874 WARN_ON_ONCE(dattr_new);
Max Krasnyanskye761b772008-07-15 04:43:49 -07007875 }
7876
Paul Jackson029190c2007-10-18 23:40:20 -07007877 /* Build new domains */
7878 for (i = 0; i < ndoms_new; i++) {
Heiko Carstensd65bd5e2008-12-09 18:49:51 +01007879 for (j = 0; j < ndoms_cur && !new_topology; j++) {
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307880 if (cpumask_equal(doms_new[i], doms_cur[j])
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007881 && dattrs_equal(dattr_new, i, dattr_cur, j))
Paul Jackson029190c2007-10-18 23:40:20 -07007882 goto match2;
7883 }
7884 /* no match - add a new doms_new */
Peter Zijlstradce840a2011-04-07 14:09:50 +02007885 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
Paul Jackson029190c2007-10-18 23:40:20 -07007886match2:
7887 ;
7888 }
7889
7890 /* Remember the new sched domains */
Rusty Russellacc3f5d2009-11-03 14:53:40 +10307891 if (doms_cur != &fallback_doms)
7892 free_sched_domains(doms_cur, ndoms_cur);
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007893 kfree(dattr_cur); /* kfree(NULL) is safe */
Paul Jackson029190c2007-10-18 23:40:20 -07007894 doms_cur = doms_new;
Hidetoshi Seto1d3504f2008-04-15 14:04:23 +09007895 dattr_cur = dattr_new;
Paul Jackson029190c2007-10-18 23:40:20 -07007896 ndoms_cur = ndoms_new;
Milton Miller73785472007-10-24 18:23:48 +02007897
7898 register_sched_domain_sysctl();
Srivatsa Vaddagiria1835612008-01-25 21:08:00 +01007899
Heiko Carstens712555e2008-04-28 11:33:07 +02007900 mutex_unlock(&sched_domains_mutex);
Paul Jackson029190c2007-10-18 23:40:20 -07007901}
7902
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007903#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Peter Zijlstrac4a88492011-04-07 14:09:42 +02007904static void reinit_sched_domains(void)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007905{
Gautham R Shenoy95402b32008-01-25 21:08:02 +01007906 get_online_cpus();
Max Krasnyanskydfb512e2008-08-29 13:11:41 -07007907
7908 /* Destroy domains first to force the rebuild */
7909 partition_sched_domains(0, NULL, NULL);
7910
Max Krasnyanskye761b772008-07-15 04:43:49 -07007911 rebuild_sched_domains();
Gautham R Shenoy95402b32008-01-25 21:08:02 +01007912 put_online_cpus();
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007913}
7914
7915static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7916{
Gautham R Shenoyafb8a9b2008-12-18 23:26:09 +05307917 unsigned int level = 0;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007918
Gautham R Shenoyafb8a9b2008-12-18 23:26:09 +05307919 if (sscanf(buf, "%u", &level) != 1)
7920 return -EINVAL;
7921
7922 /*
7923 * level is always be positive so don't check for
7924 * level < POWERSAVINGS_BALANCE_NONE which is 0
7925 * What happens on 0 or 1 byte write,
7926 * need to check for count as well?
7927 */
7928
7929 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007930 return -EINVAL;
7931
7932 if (smt)
Gautham R Shenoyafb8a9b2008-12-18 23:26:09 +05307933 sched_smt_power_savings = level;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007934 else
Gautham R Shenoyafb8a9b2008-12-18 23:26:09 +05307935 sched_mc_power_savings = level;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007936
Peter Zijlstrac4a88492011-04-07 14:09:42 +02007937 reinit_sched_domains();
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007938
Li Zefanc70f22d2009-01-05 19:07:50 +08007939 return count;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007940}
7941
Adrian Bunk6707de002007-08-12 18:08:19 +02007942#ifdef CONFIG_SCHED_MC
Andi Kleenf718cd42008-07-29 22:33:52 -07007943static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
Andi Kleenc9be0a32010-01-05 12:47:58 +01007944 struct sysdev_class_attribute *attr,
Andi Kleenf718cd42008-07-29 22:33:52 -07007945 char *page)
Adrian Bunk6707de002007-08-12 18:08:19 +02007946{
7947 return sprintf(page, "%u\n", sched_mc_power_savings);
7948}
Andi Kleenf718cd42008-07-29 22:33:52 -07007949static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
Andi Kleenc9be0a32010-01-05 12:47:58 +01007950 struct sysdev_class_attribute *attr,
Adrian Bunk6707de002007-08-12 18:08:19 +02007951 const char *buf, size_t count)
7952{
7953 return sched_power_savings_store(buf, count, 0);
7954}
Andi Kleenf718cd42008-07-29 22:33:52 -07007955static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7956 sched_mc_power_savings_show,
7957 sched_mc_power_savings_store);
Adrian Bunk6707de002007-08-12 18:08:19 +02007958#endif
7959
7960#ifdef CONFIG_SCHED_SMT
Andi Kleenf718cd42008-07-29 22:33:52 -07007961static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
Andi Kleenc9be0a32010-01-05 12:47:58 +01007962 struct sysdev_class_attribute *attr,
Andi Kleenf718cd42008-07-29 22:33:52 -07007963 char *page)
Adrian Bunk6707de002007-08-12 18:08:19 +02007964{
7965 return sprintf(page, "%u\n", sched_smt_power_savings);
7966}
Andi Kleenf718cd42008-07-29 22:33:52 -07007967static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
Andi Kleenc9be0a32010-01-05 12:47:58 +01007968 struct sysdev_class_attribute *attr,
Adrian Bunk6707de002007-08-12 18:08:19 +02007969 const char *buf, size_t count)
7970{
7971 return sched_power_savings_store(buf, count, 1);
7972}
Andi Kleenf718cd42008-07-29 22:33:52 -07007973static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7974 sched_smt_power_savings_show,
Adrian Bunk6707de002007-08-12 18:08:19 +02007975 sched_smt_power_savings_store);
7976#endif
7977
Li Zefan39aac642009-01-05 19:18:02 +08007978int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007979{
7980 int err = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07007981
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007982#ifdef CONFIG_SCHED_SMT
7983 if (smt_capable())
7984 err = sysfs_create_file(&cls->kset.kobj,
7985 &attr_sched_smt_power_savings.attr);
7986#endif
7987#ifdef CONFIG_SCHED_MC
7988 if (!err && mc_capable())
7989 err = sysfs_create_file(&cls->kset.kobj,
7990 &attr_sched_mc_power_savings.attr);
7991#endif
7992 return err;
7993}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02007994#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07007995
Linus Torvalds1da177e2005-04-16 15:20:36 -07007996/*
Tejun Heo3a101d02010-06-08 21:40:36 +02007997 * Update cpusets according to cpu_active mask. If cpusets are
7998 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7999 * around partition_sched_domains().
Linus Torvalds1da177e2005-04-16 15:20:36 -07008000 */
Tejun Heo0b2e9182010-06-21 23:53:31 +02008001static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
8002 void *hcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07008003{
Tejun Heo3a101d02010-06-08 21:40:36 +02008004 switch (action & ~CPU_TASKS_FROZEN) {
Max Krasnyanskye761b772008-07-15 04:43:49 -07008005 case CPU_ONLINE:
Peter Zijlstra6ad4c182009-11-25 13:31:39 +01008006 case CPU_DOWN_FAILED:
Tejun Heo3a101d02010-06-08 21:40:36 +02008007 cpuset_update_active_cpus();
Max Krasnyanskye761b772008-07-15 04:43:49 -07008008 return NOTIFY_OK;
Max Krasnyanskye761b772008-07-15 04:43:49 -07008009 default:
8010 return NOTIFY_DONE;
8011 }
8012}
Tejun Heo3a101d02010-06-08 21:40:36 +02008013
Tejun Heo0b2e9182010-06-21 23:53:31 +02008014static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8015 void *hcpu)
Tejun Heo3a101d02010-06-08 21:40:36 +02008016{
8017 switch (action & ~CPU_TASKS_FROZEN) {
8018 case CPU_DOWN_PREPARE:
8019 cpuset_update_active_cpus();
8020 return NOTIFY_OK;
8021 default:
8022 return NOTIFY_DONE;
8023 }
8024}
Max Krasnyanskye761b772008-07-15 04:43:49 -07008025
8026static int update_runtime(struct notifier_block *nfb,
8027 unsigned long action, void *hcpu)
8028{
Peter Zijlstra7def2be2008-06-05 14:49:58 +02008029 int cpu = (int)(long)hcpu;
8030
Linus Torvalds1da177e2005-04-16 15:20:36 -07008031 switch (action) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07008032 case CPU_DOWN_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07008033 case CPU_DOWN_PREPARE_FROZEN:
Peter Zijlstra7def2be2008-06-05 14:49:58 +02008034 disable_runtime(cpu_rq(cpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -07008035 return NOTIFY_OK;
8036
Linus Torvalds1da177e2005-04-16 15:20:36 -07008037 case CPU_DOWN_FAILED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07008038 case CPU_DOWN_FAILED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07008039 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07008040 case CPU_ONLINE_FROZEN:
Peter Zijlstra7def2be2008-06-05 14:49:58 +02008041 enable_runtime(cpu_rq(cpu));
Max Krasnyanskye761b772008-07-15 04:43:49 -07008042 return NOTIFY_OK;
8043
Linus Torvalds1da177e2005-04-16 15:20:36 -07008044 default:
8045 return NOTIFY_DONE;
8046 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07008047}
Linus Torvalds1da177e2005-04-16 15:20:36 -07008048
8049void __init sched_init_smp(void)
8050{
Rusty Russelldcc30a32008-11-25 02:35:12 +10308051 cpumask_var_t non_isolated_cpus;
8052
8053 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
Yong Zhangcb5fd132009-09-14 20:20:16 +08008054 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
Nick Piggin5c1e1762006-10-03 01:14:04 -07008055
Gautham R Shenoy95402b32008-01-25 21:08:02 +01008056 get_online_cpus();
Heiko Carstens712555e2008-04-28 11:33:07 +02008057 mutex_lock(&sched_domains_mutex);
Peter Zijlstrac4a88492011-04-07 14:09:42 +02008058 init_sched_domains(cpu_active_mask);
Rusty Russelldcc30a32008-11-25 02:35:12 +10308059 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
8060 if (cpumask_empty(non_isolated_cpus))
8061 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
Heiko Carstens712555e2008-04-28 11:33:07 +02008062 mutex_unlock(&sched_domains_mutex);
Gautham R Shenoy95402b32008-01-25 21:08:02 +01008063 put_online_cpus();
Max Krasnyanskye761b772008-07-15 04:43:49 -07008064
Tejun Heo3a101d02010-06-08 21:40:36 +02008065 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
8066 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
Max Krasnyanskye761b772008-07-15 04:43:49 -07008067
8068 /* RT runtime code needs to handle some hotplug events */
8069 hotcpu_notifier(update_runtime, 0);
8070
Peter Zijlstrab328ca12008-04-29 10:02:46 +02008071 init_hrtick();
Nick Piggin5c1e1762006-10-03 01:14:04 -07008072
8073 /* Move init over to a non-isolated CPU */
Rusty Russelldcc30a32008-11-25 02:35:12 +10308074 if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
Nick Piggin5c1e1762006-10-03 01:14:04 -07008075 BUG();
Ingo Molnar19978ca2007-11-09 22:39:38 +01008076 sched_init_granularity();
Rusty Russelldcc30a32008-11-25 02:35:12 +10308077 free_cpumask_var(non_isolated_cpus);
Rusty Russell42128232008-11-25 02:35:12 +10308078
Rusty Russell0e3900e2008-11-25 02:35:13 +10308079 init_sched_rt_class();
Linus Torvalds1da177e2005-04-16 15:20:36 -07008080}
8081#else
8082void __init sched_init_smp(void)
8083{
Ingo Molnar19978ca2007-11-09 22:39:38 +01008084 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07008085}
8086#endif /* CONFIG_SMP */
8087
Arun R Bharadwajcd1bb942009-04-16 12:15:34 +05308088const_debug unsigned int sysctl_timer_migration = 1;
8089
Linus Torvalds1da177e2005-04-16 15:20:36 -07008090int in_sched_functions(unsigned long addr)
8091{
Linus Torvalds1da177e2005-04-16 15:20:36 -07008092 return in_lock_functions(addr) ||
8093 (addr >= (unsigned long)__sched_text_start
8094 && addr < (unsigned long)__sched_text_end);
8095}
8096
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008097static void init_cfs_rq(struct cfs_rq *cfs_rq)
Ingo Molnardd41f592007-07-09 18:51:59 +02008098{
8099 cfs_rq->tasks_timeline = RB_ROOT;
Peter Zijlstra4a55bd52008-04-19 19:45:00 +02008100 INIT_LIST_HEAD(&cfs_rq->tasks);
Peter Zijlstra67e9fb22007-10-15 17:00:10 +02008101 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
Peter Zijlstrac64be782011-07-11 16:28:50 +02008102#ifndef CONFIG_64BIT
8103 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8104#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02008105}
8106
Peter Zijlstrafa85ae22008-01-25 21:08:29 +01008107static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8108{
8109 struct rt_prio_array *array;
8110 int i;
8111
8112 array = &rt_rq->active;
8113 for (i = 0; i < MAX_RT_PRIO; i++) {
8114 INIT_LIST_HEAD(array->queue + i);
8115 __clear_bit(i, array->bitmap);
8116 }
8117 /* delimiter for bitsearch: */
8118 __set_bit(MAX_RT_PRIO, array->bitmap);
8119
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008120#if defined CONFIG_SMP
Gregory Haskinse864c492008-12-29 09:39:49 -05008121 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8122 rt_rq->highest_prio.next = MAX_RT_PRIO;
Peter Zijlstrafa85ae22008-01-25 21:08:29 +01008123 rt_rq->rt_nr_migratory = 0;
Peter Zijlstrafa85ae22008-01-25 21:08:29 +01008124 rt_rq->overloaded = 0;
Dima Zavin732375c2011-07-07 17:27:59 -07008125 plist_head_init(&rt_rq->pushable_tasks);
Peter Zijlstrafa85ae22008-01-25 21:08:29 +01008126#endif
8127
8128 rt_rq->rt_time = 0;
8129 rt_rq->rt_throttled = 0;
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008130 rt_rq->rt_runtime = 0;
Thomas Gleixner0986b112009-11-17 15:32:06 +01008131 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
Peter Zijlstrafa85ae22008-01-25 21:08:29 +01008132}
8133
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008134#ifdef CONFIG_FAIR_GROUP_SCHED
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008135static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008136 struct sched_entity *se, int cpu,
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008137 struct sched_entity *parent)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008138{
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008139 struct rq *rq = cpu_rq(cpu);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008140
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008141 cfs_rq->tg = tg;
8142 cfs_rq->rq = rq;
8143#ifdef CONFIG_SMP
8144 /* allow initial update_cfs_load() to truncate */
8145 cfs_rq->load_stamp = 1;
8146#endif
Paul Turnerab84d312011-07-21 09:43:28 -07008147 init_cfs_rq_runtime(cfs_rq);
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008148
8149 tg->cfs_rq[cpu] = cfs_rq;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008150 tg->se[cpu] = se;
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008151
Yong Zhang07e06b02011-01-07 15:17:36 +08008152 /* se could be NULL for root_task_group */
Dhaval Giani354d60c2008-04-19 19:44:59 +02008153 if (!se)
8154 return;
8155
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008156 if (!parent)
8157 se->cfs_rq = &rq->cfs;
8158 else
8159 se->cfs_rq = parent->my_q;
8160
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008161 se->my_q = cfs_rq;
Paul Turner94371782010-11-15 15:47:10 -08008162 update_load_set(&se->load, 0);
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008163 se->parent = parent;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008164}
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01008165#endif
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008166
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01008167#ifdef CONFIG_RT_GROUP_SCHED
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008168static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008169 struct sched_rt_entity *rt_se, int cpu,
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008170 struct sched_rt_entity *parent)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008171{
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008172 struct rq *rq = cpu_rq(cpu);
8173
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008174 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8175 rt_rq->rt_nr_boosted = 0;
8176 rt_rq->rq = rq;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008177 rt_rq->tg = tg;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008178
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008179 tg->rt_rq[cpu] = rt_rq;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008180 tg->rt_se[cpu] = rt_se;
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008181
Dhaval Giani354d60c2008-04-19 19:44:59 +02008182 if (!rt_se)
8183 return;
8184
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008185 if (!parent)
8186 rt_se->rt_rq = &rq->rt;
8187 else
8188 rt_se->rt_rq = parent->my_q;
8189
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008190 rt_se->my_q = rt_rq;
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008191 rt_se->parent = parent;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008192 INIT_LIST_HEAD(&rt_se->run_list);
8193}
8194#endif
8195
Linus Torvalds1da177e2005-04-16 15:20:36 -07008196void __init sched_init(void)
8197{
Ingo Molnardd41f592007-07-09 18:51:59 +02008198 int i, j;
Mike Travis434d53b2008-04-04 18:11:04 -07008199 unsigned long alloc_size = 0, ptr;
8200
8201#ifdef CONFIG_FAIR_GROUP_SCHED
8202 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8203#endif
8204#ifdef CONFIG_RT_GROUP_SCHED
8205 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
8206#endif
Rusty Russelldf7c8e82009-03-19 15:22:20 +10308207#ifdef CONFIG_CPUMASK_OFFSTACK
Rusty Russell8c083f02009-03-19 15:22:20 +10308208 alloc_size += num_possible_cpus() * cpumask_size();
Rusty Russelldf7c8e82009-03-19 15:22:20 +10308209#endif
Mike Travis434d53b2008-04-04 18:11:04 -07008210 if (alloc_size) {
Pekka Enberg36b7b6d2009-06-10 23:42:36 +03008211 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
Mike Travis434d53b2008-04-04 18:11:04 -07008212
8213#ifdef CONFIG_FAIR_GROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008214 root_task_group.se = (struct sched_entity **)ptr;
Mike Travis434d53b2008-04-04 18:11:04 -07008215 ptr += nr_cpu_ids * sizeof(void **);
8216
Yong Zhang07e06b02011-01-07 15:17:36 +08008217 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
Mike Travis434d53b2008-04-04 18:11:04 -07008218 ptr += nr_cpu_ids * sizeof(void **);
Peter Zijlstraeff766a2008-04-19 19:45:00 +02008219
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008220#endif /* CONFIG_FAIR_GROUP_SCHED */
Mike Travis434d53b2008-04-04 18:11:04 -07008221#ifdef CONFIG_RT_GROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008222 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
Mike Travis434d53b2008-04-04 18:11:04 -07008223 ptr += nr_cpu_ids * sizeof(void **);
8224
Yong Zhang07e06b02011-01-07 15:17:36 +08008225 root_task_group.rt_rq = (struct rt_rq **)ptr;
Peter Zijlstraeff766a2008-04-19 19:45:00 +02008226 ptr += nr_cpu_ids * sizeof(void **);
8227
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008228#endif /* CONFIG_RT_GROUP_SCHED */
Rusty Russelldf7c8e82009-03-19 15:22:20 +10308229#ifdef CONFIG_CPUMASK_OFFSTACK
8230 for_each_possible_cpu(i) {
8231 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8232 ptr += cpumask_size();
8233 }
8234#endif /* CONFIG_CPUMASK_OFFSTACK */
Mike Travis434d53b2008-04-04 18:11:04 -07008235 }
Ingo Molnardd41f592007-07-09 18:51:59 +02008236
Gregory Haskins57d885f2008-01-25 21:08:18 +01008237#ifdef CONFIG_SMP
8238 init_defrootdomain();
8239#endif
8240
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008241 init_rt_bandwidth(&def_rt_bandwidth,
8242 global_rt_period(), global_rt_runtime());
8243
8244#ifdef CONFIG_RT_GROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008245 init_rt_bandwidth(&root_task_group.rt_bandwidth,
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008246 global_rt_period(), global_rt_runtime());
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008247#endif /* CONFIG_RT_GROUP_SCHED */
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008248
Dhaval Giani7c941432010-01-20 13:26:18 +01008249#ifdef CONFIG_CGROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008250 list_add(&root_task_group.list, &task_groups);
8251 INIT_LIST_HEAD(&root_task_group.children);
Mike Galbraith5091faa2010-11-30 14:18:03 +01008252 autogroup_init(&init_task);
Dhaval Giani7c941432010-01-20 13:26:18 +01008253#endif /* CONFIG_CGROUP_SCHED */
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008254
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08008255 for_each_possible_cpu(i) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07008256 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008257
8258 rq = cpu_rq(i);
Thomas Gleixner05fa7852009-11-17 14:28:38 +01008259 raw_spin_lock_init(&rq->lock);
Nick Piggin78979862005-06-25 14:57:13 -07008260 rq->nr_running = 0;
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02008261 rq->calc_load_active = 0;
8262 rq->calc_load_update = jiffies + LOAD_FREQ;
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008263 init_cfs_rq(&rq->cfs);
Peter Zijlstrafa85ae22008-01-25 21:08:29 +01008264 init_rt_rq(&rq->rt, rq);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008265#ifdef CONFIG_FAIR_GROUP_SCHED
Yong Zhang07e06b02011-01-07 15:17:36 +08008266 root_task_group.shares = root_task_group_load;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008267 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
Dhaval Giani354d60c2008-04-19 19:44:59 +02008268 /*
Yong Zhang07e06b02011-01-07 15:17:36 +08008269 * How much cpu bandwidth does root_task_group get?
Dhaval Giani354d60c2008-04-19 19:44:59 +02008270 *
8271 * In case of task-groups formed thr' the cgroup filesystem, it
8272 * gets 100% of the cpu resources in the system. This overall
8273 * system cpu resource is divided among the tasks of
Yong Zhang07e06b02011-01-07 15:17:36 +08008274 * root_task_group and its child task-groups in a fair manner,
Dhaval Giani354d60c2008-04-19 19:44:59 +02008275 * based on each entity's (task or task-group's) weight
8276 * (se->load.weight).
8277 *
Yong Zhang07e06b02011-01-07 15:17:36 +08008278 * In other words, if root_task_group has 10 tasks of weight
Dhaval Giani354d60c2008-04-19 19:44:59 +02008279 * 1024) and two child groups A0 and A1 (of weight 1024 each),
8280 * then A0's share of the cpu resource is:
8281 *
Ingo Molnar0d905bc2009-05-04 19:13:30 +02008282 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
Dhaval Giani354d60c2008-04-19 19:44:59 +02008283 *
Yong Zhang07e06b02011-01-07 15:17:36 +08008284 * We achieve this by letting root_task_group's tasks sit
8285 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
Dhaval Giani354d60c2008-04-19 19:44:59 +02008286 */
Paul Turnerab84d312011-07-21 09:43:28 -07008287 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
Yong Zhang07e06b02011-01-07 15:17:36 +08008288 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
Dhaval Giani354d60c2008-04-19 19:44:59 +02008289#endif /* CONFIG_FAIR_GROUP_SCHED */
8290
8291 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01008292#ifdef CONFIG_RT_GROUP_SCHED
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008293 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
Yong Zhang07e06b02011-01-07 15:17:36 +08008294 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008295#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07008296
Ingo Molnardd41f592007-07-09 18:51:59 +02008297 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
8298 rq->cpu_load[j] = 0;
Venkatesh Pallipadifdf3e952010-05-17 18:14:43 -07008299
8300 rq->last_load_update_tick = jiffies;
8301
Linus Torvalds1da177e2005-04-16 15:20:36 -07008302#ifdef CONFIG_SMP
Nick Piggin41c7ce92005-06-25 14:57:24 -07008303 rq->sd = NULL;
Gregory Haskins57d885f2008-01-25 21:08:18 +01008304 rq->rd = NULL;
Nikhil Rao1399fa72011-05-18 10:09:39 -07008305 rq->cpu_power = SCHED_POWER_SCALE;
Gregory Haskins3f029d32009-07-29 11:08:47 -04008306 rq->post_schedule = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008307 rq->active_balance = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02008308 rq->next_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008309 rq->push_cpu = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07008310 rq->cpu = i;
Gregory Haskins1f11eb62008-06-04 15:04:05 -04008311 rq->online = 0;
Mike Galbraitheae0c9d2009-11-10 03:50:02 +01008312 rq->idle_stamp = 0;
8313 rq->avg_idle = 2*sysctl_sched_migration_cost;
Gregory Haskinsdc938522008-01-25 21:08:26 +01008314 rq_attach_root(rq, &def_root_domain);
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07008315#ifdef CONFIG_NO_HZ
8316 rq->nohz_balance_kick = 0;
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07008317#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07008318#endif
Peter Zijlstra8f4d37e2008-01-25 21:08:29 +01008319 init_rq_hrtick(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008320 atomic_set(&rq->nr_iowait, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008321 }
8322
Peter Williams2dd73a42006-06-27 02:54:34 -07008323 set_load_weight(&init_task);
Heiko Carstensb50f60c2006-07-30 03:03:52 -07008324
Avi Kivitye107be32007-07-26 13:40:43 +02008325#ifdef CONFIG_PREEMPT_NOTIFIERS
8326 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8327#endif
8328
Christoph Lameterc9819f42006-12-10 02:20:25 -08008329#ifdef CONFIG_SMP
Carlos R. Mafra962cf362008-05-15 11:15:37 -03008330 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
Christoph Lameterc9819f42006-12-10 02:20:25 -08008331#endif
8332
Heiko Carstensb50f60c2006-07-30 03:03:52 -07008333#ifdef CONFIG_RT_MUTEXES
Dima Zavin732375c2011-07-07 17:27:59 -07008334 plist_head_init(&init_task.pi_waiters);
Heiko Carstensb50f60c2006-07-30 03:03:52 -07008335#endif
8336
Linus Torvalds1da177e2005-04-16 15:20:36 -07008337 /*
8338 * The boot idle thread does lazy MMU switching as well:
8339 */
8340 atomic_inc(&init_mm.mm_count);
8341 enter_lazy_tlb(&init_mm, current);
8342
8343 /*
8344 * Make us the idle thread. Technically, schedule() should not be
8345 * called from this thread, however somewhere below it might be,
8346 * but because we are the idle thread, we just pick up running again
8347 * when this runqueue becomes "idle".
8348 */
8349 init_idle(current, smp_processor_id());
Thomas Gleixnerdce48a82009-04-11 10:43:41 +02008350
8351 calc_load_update = jiffies + LOAD_FREQ;
8352
Ingo Molnardd41f592007-07-09 18:51:59 +02008353 /*
8354 * During early bootup we pretend to be a normal task:
8355 */
8356 current->sched_class = &fair_sched_class;
Ingo Molnar6892b752008-02-13 14:02:36 +01008357
Rusty Russellbf4d83f2008-11-25 09:57:51 +10308358#ifdef CONFIG_SMP
Peter Zijlstra4cb98832011-04-07 14:09:58 +02008359 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
Rusty Russell7d1e6a92008-11-25 02:35:09 +10308360#ifdef CONFIG_NO_HZ
Venkatesh Pallipadi83cd4fe2010-05-21 17:09:41 -07008361 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8362 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8363 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8364 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8365 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
Rusty Russell7d1e6a92008-11-25 02:35:09 +10308366#endif
Rusty Russellbdddd292009-12-02 14:09:16 +10308367 /* May be allocated at isolcpus cmdline parse time */
8368 if (cpu_isolated_map == NULL)
8369 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
Rusty Russellbf4d83f2008-11-25 09:57:51 +10308370#endif /* SMP */
Rusty Russell6a7b3dc2008-11-25 02:35:04 +10308371
Ingo Molnar6892b752008-02-13 14:02:36 +01008372 scheduler_running = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008373}
8374
Frederic Weisbeckerd902db12011-06-08 19:31:56 +02008375#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
Frederic Weisbeckere4aafea2009-07-16 15:44:29 +02008376static inline int preempt_count_equals(int preempt_offset)
8377{
Frederic Weisbecker234da7b2009-12-16 20:21:05 +01008378 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
Frederic Weisbeckere4aafea2009-07-16 15:44:29 +02008379
Arnd Bergmann4ba82162011-01-25 22:52:22 +01008380 return (nested == preempt_offset);
Frederic Weisbeckere4aafea2009-07-16 15:44:29 +02008381}
8382
Simon Kagstromd8948372009-12-23 11:08:18 +01008383void __might_sleep(const char *file, int line, int preempt_offset)
Linus Torvalds1da177e2005-04-16 15:20:36 -07008384{
Linus Torvalds1da177e2005-04-16 15:20:36 -07008385 static unsigned long prev_jiffy; /* ratelimiting */
8386
Paul E. McKenneyb3fbab02011-05-24 08:31:09 -07008387 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
Frederic Weisbeckere4aafea2009-07-16 15:44:29 +02008388 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8389 system_state != SYSTEM_RUNNING || oops_in_progress)
Ingo Molnaraef745f2008-08-28 11:34:43 +02008390 return;
8391 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8392 return;
8393 prev_jiffy = jiffies;
8394
Peter Zijlstra3df0fc52009-12-20 14:23:57 +01008395 printk(KERN_ERR
8396 "BUG: sleeping function called from invalid context at %s:%d\n",
8397 file, line);
8398 printk(KERN_ERR
8399 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8400 in_atomic(), irqs_disabled(),
8401 current->pid, current->comm);
Ingo Molnaraef745f2008-08-28 11:34:43 +02008402
8403 debug_show_held_locks(current);
8404 if (irqs_disabled())
8405 print_irqtrace_events(current);
8406 dump_stack();
Linus Torvalds1da177e2005-04-16 15:20:36 -07008407}
8408EXPORT_SYMBOL(__might_sleep);
8409#endif
8410
8411#ifdef CONFIG_MAGIC_SYSRQ
Andi Kleen3a5e4dc2007-10-15 17:00:15 +02008412static void normalize_task(struct rq *rq, struct task_struct *p)
8413{
Peter Zijlstrada7a7352011-01-17 17:03:27 +01008414 const struct sched_class *prev_class = p->sched_class;
8415 int old_prio = p->prio;
Andi Kleen3a5e4dc2007-10-15 17:00:15 +02008416 int on_rq;
Peter Zijlstra3e51f332008-05-03 18:29:28 +02008417
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02008418 on_rq = p->on_rq;
Andi Kleen3a5e4dc2007-10-15 17:00:15 +02008419 if (on_rq)
8420 deactivate_task(rq, p, 0);
8421 __setscheduler(rq, p, SCHED_NORMAL, 0);
8422 if (on_rq) {
8423 activate_task(rq, p, 0);
8424 resched_task(rq->curr);
8425 }
Peter Zijlstrada7a7352011-01-17 17:03:27 +01008426
8427 check_class_changed(rq, p, prev_class, old_prio);
Andi Kleen3a5e4dc2007-10-15 17:00:15 +02008428}
8429
Linus Torvalds1da177e2005-04-16 15:20:36 -07008430void normalize_rt_tasks(void)
8431{
Ingo Molnara0f98a12007-06-17 18:37:45 +02008432 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008433 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07008434 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07008435
Peter Zijlstra4cf5d772008-02-13 15:45:39 +01008436 read_lock_irqsave(&tasklist_lock, flags);
Ingo Molnara0f98a12007-06-17 18:37:45 +02008437 do_each_thread(g, p) {
Ingo Molnar178be792007-10-15 17:00:18 +02008438 /*
8439 * Only normalize user tasks:
8440 */
8441 if (!p->mm)
8442 continue;
8443
Ingo Molnardd41f592007-07-09 18:51:59 +02008444 p->se.exec_start = 0;
Ingo Molnar6cfb0d52007-08-02 17:41:40 +02008445#ifdef CONFIG_SCHEDSTATS
Lucas De Marchi41acab82010-03-10 23:37:45 -03008446 p->se.statistics.wait_start = 0;
8447 p->se.statistics.sleep_start = 0;
8448 p->se.statistics.block_start = 0;
Ingo Molnar6cfb0d52007-08-02 17:41:40 +02008449#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02008450
8451 if (!rt_task(p)) {
8452 /*
8453 * Renice negative nice level userspace
8454 * tasks back to 0:
8455 */
8456 if (TASK_NICE(p) < 0 && p->mm)
8457 set_user_nice(p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008458 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02008459 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07008460
Thomas Gleixner1d615482009-11-17 14:54:03 +01008461 raw_spin_lock(&p->pi_lock);
Ingo Molnarb29739f2006-06-27 02:54:51 -07008462 rq = __task_rq_lock(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008463
Ingo Molnar178be792007-10-15 17:00:18 +02008464 normalize_task(rq, p);
Andi Kleen3a5e4dc2007-10-15 17:00:15 +02008465
Ingo Molnarb29739f2006-06-27 02:54:51 -07008466 __task_rq_unlock(rq);
Thomas Gleixner1d615482009-11-17 14:54:03 +01008467 raw_spin_unlock(&p->pi_lock);
Ingo Molnara0f98a12007-06-17 18:37:45 +02008468 } while_each_thread(g, p);
8469
Peter Zijlstra4cf5d772008-02-13 15:45:39 +01008470 read_unlock_irqrestore(&tasklist_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07008471}
8472
8473#endif /* CONFIG_MAGIC_SYSRQ */
Linus Torvalds1df5c102005-09-12 07:59:21 -07008474
Jason Wessel67fc4e02010-05-20 21:04:21 -05008475#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
Linus Torvalds1df5c102005-09-12 07:59:21 -07008476/*
Jason Wessel67fc4e02010-05-20 21:04:21 -05008477 * These functions are only useful for the IA64 MCA handling, or kdb.
Linus Torvalds1df5c102005-09-12 07:59:21 -07008478 *
8479 * They can only be called when the whole system has been
8480 * stopped - every CPU needs to be quiescent, and no scheduling
8481 * activity can take place. Using them for anything else would
8482 * be a serious bug, and as a result, they aren't even visible
8483 * under any other configuration.
8484 */
8485
8486/**
8487 * curr_task - return the current task for a given cpu.
8488 * @cpu: the processor in question.
8489 *
8490 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8491 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07008492struct task_struct *curr_task(int cpu)
Linus Torvalds1df5c102005-09-12 07:59:21 -07008493{
8494 return cpu_curr(cpu);
8495}
8496
Jason Wessel67fc4e02010-05-20 21:04:21 -05008497#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8498
8499#ifdef CONFIG_IA64
Linus Torvalds1df5c102005-09-12 07:59:21 -07008500/**
8501 * set_curr_task - set the current task for a given cpu.
8502 * @cpu: the processor in question.
8503 * @p: the task pointer to set.
8504 *
8505 * Description: This function must only be used when non-maskable interrupts
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01008506 * are serviced on a separate stack. It allows the architecture to switch the
8507 * notion of the current task on a cpu in a non-blocking manner. This function
Linus Torvalds1df5c102005-09-12 07:59:21 -07008508 * must be called with all CPU's synchronized, and interrupts disabled, the
8509 * and caller must save the original value of the current task (see
8510 * curr_task() above) and restore that value before reenabling interrupts and
8511 * re-starting the system.
8512 *
8513 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8514 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07008515void set_curr_task(int cpu, struct task_struct *p)
Linus Torvalds1df5c102005-09-12 07:59:21 -07008516{
8517 cpu_curr(cpu) = p;
8518}
8519
8520#endif
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008521
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008522#ifdef CONFIG_FAIR_GROUP_SCHED
8523static void free_fair_sched_group(struct task_group *tg)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008524{
8525 int i;
8526
Paul Turnerab84d312011-07-21 09:43:28 -07008527 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8528
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008529 for_each_possible_cpu(i) {
8530 if (tg->cfs_rq)
8531 kfree(tg->cfs_rq[i]);
8532 if (tg->se)
8533 kfree(tg->se[i]);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008534 }
8535
8536 kfree(tg->cfs_rq);
8537 kfree(tg->se);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008538}
8539
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008540static
8541int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008542{
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008543 struct cfs_rq *cfs_rq;
Li Zefaneab17222008-10-29 17:03:22 +08008544 struct sched_entity *se;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008545 int i;
8546
Mike Travis434d53b2008-04-04 18:11:04 -07008547 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008548 if (!tg->cfs_rq)
8549 goto err;
Mike Travis434d53b2008-04-04 18:11:04 -07008550 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008551 if (!tg->se)
8552 goto err;
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01008553
8554 tg->shares = NICE_0_LOAD;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008555
Paul Turnerab84d312011-07-21 09:43:28 -07008556 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8557
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008558 for_each_possible_cpu(i) {
Li Zefaneab17222008-10-29 17:03:22 +08008559 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8560 GFP_KERNEL, cpu_to_node(i));
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008561 if (!cfs_rq)
8562 goto err;
8563
Li Zefaneab17222008-10-29 17:03:22 +08008564 se = kzalloc_node(sizeof(struct sched_entity),
8565 GFP_KERNEL, cpu_to_node(i));
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008566 if (!se)
Phil Carmodydfc12eb2009-12-10 14:29:37 +02008567 goto err_free_rq;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008568
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008569 init_cfs_rq(cfs_rq);
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008570 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008571 }
8572
8573 return 1;
8574
Peter Zijlstra49246272010-10-17 21:46:10 +02008575err_free_rq:
Phil Carmodydfc12eb2009-12-10 14:29:37 +02008576 kfree(cfs_rq);
Peter Zijlstra49246272010-10-17 21:46:10 +02008577err:
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008578 return 0;
8579}
8580
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008581static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8582{
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008583 struct rq *rq = cpu_rq(cpu);
8584 unsigned long flags;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008585
8586 /*
8587 * Only empty task groups can be destroyed; so we can speculatively
8588 * check on_list without danger of it being re-added.
8589 */
8590 if (!tg->cfs_rq[cpu]->on_list)
8591 return;
8592
8593 raw_spin_lock_irqsave(&rq->lock, flags);
Paul Turner822bc182010-11-29 16:55:40 -08008594 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008595 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008596}
Jan Schoenherr5f817d62011-07-13 20:13:31 +02008597#else /* !CONFIG_FAIR_GROUP_SCHED */
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008598static inline void free_fair_sched_group(struct task_group *tg)
8599{
8600}
8601
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008602static inline
8603int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008604{
8605 return 1;
8606}
8607
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008608static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8609{
8610}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008611#endif /* CONFIG_FAIR_GROUP_SCHED */
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01008612
8613#ifdef CONFIG_RT_GROUP_SCHED
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008614static void free_rt_sched_group(struct task_group *tg)
8615{
8616 int i;
8617
Bianca Lutz99bc5242011-07-13 20:13:36 +02008618 if (tg->rt_se)
8619 destroy_rt_bandwidth(&tg->rt_bandwidth);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008620
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008621 for_each_possible_cpu(i) {
8622 if (tg->rt_rq)
8623 kfree(tg->rt_rq[i]);
8624 if (tg->rt_se)
8625 kfree(tg->rt_se[i]);
8626 }
8627
8628 kfree(tg->rt_rq);
8629 kfree(tg->rt_se);
8630}
8631
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008632static
8633int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008634{
8635 struct rt_rq *rt_rq;
Li Zefaneab17222008-10-29 17:03:22 +08008636 struct sched_rt_entity *rt_se;
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008637 int i;
8638
Mike Travis434d53b2008-04-04 18:11:04 -07008639 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008640 if (!tg->rt_rq)
8641 goto err;
Mike Travis434d53b2008-04-04 18:11:04 -07008642 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008643 if (!tg->rt_se)
8644 goto err;
8645
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008646 init_rt_bandwidth(&tg->rt_bandwidth,
8647 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008648
8649 for_each_possible_cpu(i) {
Li Zefaneab17222008-10-29 17:03:22 +08008650 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8651 GFP_KERNEL, cpu_to_node(i));
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008652 if (!rt_rq)
8653 goto err;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008654
Li Zefaneab17222008-10-29 17:03:22 +08008655 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8656 GFP_KERNEL, cpu_to_node(i));
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008657 if (!rt_se)
Phil Carmodydfc12eb2009-12-10 14:29:37 +02008658 goto err_free_rq;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008659
Jan H. Schönherracb5a9b2011-07-14 18:32:43 +02008660 init_rt_rq(rt_rq, cpu_rq(i));
8661 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008662 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008663 }
8664
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008665 return 1;
8666
Peter Zijlstra49246272010-10-17 21:46:10 +02008667err_free_rq:
Phil Carmodydfc12eb2009-12-10 14:29:37 +02008668 kfree(rt_rq);
Peter Zijlstra49246272010-10-17 21:46:10 +02008669err:
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008670 return 0;
8671}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008672#else /* !CONFIG_RT_GROUP_SCHED */
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008673static inline void free_rt_sched_group(struct task_group *tg)
8674{
8675}
8676
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008677static inline
8678int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008679{
8680 return 1;
8681}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02008682#endif /* CONFIG_RT_GROUP_SCHED */
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008683
Dhaval Giani7c941432010-01-20 13:26:18 +01008684#ifdef CONFIG_CGROUP_SCHED
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008685static void free_sched_group(struct task_group *tg)
8686{
8687 free_fair_sched_group(tg);
8688 free_rt_sched_group(tg);
Mike Galbraithe9aa1dd2011-01-05 11:11:25 +01008689 autogroup_free(tg);
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008690 kfree(tg);
8691}
8692
8693/* allocate runqueue etc for a new task group */
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008694struct task_group *sched_create_group(struct task_group *parent)
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008695{
8696 struct task_group *tg;
8697 unsigned long flags;
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008698
8699 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8700 if (!tg)
8701 return ERR_PTR(-ENOMEM);
8702
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008703 if (!alloc_fair_sched_group(tg, parent))
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008704 goto err;
8705
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008706 if (!alloc_rt_sched_group(tg, parent))
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008707 goto err;
8708
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008709 spin_lock_irqsave(&task_group_lock, flags);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008710 list_add_rcu(&tg->list, &task_groups);
Peter Zijlstraf473aa52008-04-19 19:45:00 +02008711
8712 WARN_ON(!parent); /* root should already exist */
8713
8714 tg->parent = parent;
Peter Zijlstraf473aa52008-04-19 19:45:00 +02008715 INIT_LIST_HEAD(&tg->children);
Zhang, Yanmin09f27242030-08-14 15:56:40 +08008716 list_add_rcu(&tg->siblings, &parent->children);
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008717 spin_unlock_irqrestore(&task_group_lock, flags);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008718
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008719 return tg;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008720
8721err:
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008722 free_sched_group(tg);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008723 return ERR_PTR(-ENOMEM);
8724}
8725
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008726/* rcu callback to free various structures associated with a task group */
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008727static void free_sched_group_rcu(struct rcu_head *rhp)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008728{
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008729 /* now it should be safe to free those cfs_rqs */
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008730 free_sched_group(container_of(rhp, struct task_group, rcu));
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008731}
8732
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008733/* Destroy runqueue etc associated with a task group */
Ingo Molnar4cf86d72007-10-15 17:00:14 +02008734void sched_destroy_group(struct task_group *tg)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008735{
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008736 unsigned long flags;
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008737 int i;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008738
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008739 /* end participation in shares distribution */
8740 for_each_possible_cpu(i)
Peter Zijlstrabccbe082008-02-13 15:45:40 +01008741 unregister_fair_sched_group(tg, i);
Peter Zijlstra3d4b47b2010-11-15 15:47:01 -08008742
8743 spin_lock_irqsave(&task_group_lock, flags);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008744 list_del_rcu(&tg->list);
Peter Zijlstraf473aa52008-04-19 19:45:00 +02008745 list_del_rcu(&tg->siblings);
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008746 spin_unlock_irqrestore(&task_group_lock, flags);
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008747
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008748 /* wait for possible concurrent references to cfs_rqs complete */
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008749 call_rcu(&tg->rcu, free_sched_group_rcu);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008750}
8751
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008752/* change task's runqueue when it moves between groups.
Ingo Molnar3a252012007-10-15 17:00:12 +02008753 * The caller of this function should have put the task in its new group
8754 * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
8755 * reflect its new group.
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008756 */
8757void sched_move_task(struct task_struct *tsk)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008758{
8759 int on_rq, running;
8760 unsigned long flags;
8761 struct rq *rq;
8762
8763 rq = task_rq_lock(tsk, &flags);
8764
Dmitry Adamushko051a1d12007-12-18 15:21:13 +01008765 running = task_current(rq, tsk);
Peter Zijlstrafd2f4412011-04-05 17:23:44 +02008766 on_rq = tsk->on_rq;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008767
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07008768 if (on_rq)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008769 dequeue_task(rq, tsk, 0);
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07008770 if (unlikely(running))
8771 tsk->sched_class->put_prev_task(rq, tsk);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008772
Peter Zijlstra810b3812008-02-29 15:21:01 -05008773#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +02008774 if (tsk->sched_class->task_move_group)
8775 tsk->sched_class->task_move_group(tsk, on_rq);
8776 else
Peter Zijlstra810b3812008-02-29 15:21:01 -05008777#endif
Peter Zijlstrab2b5ce02010-10-15 15:24:15 +02008778 set_task_rq(tsk, task_cpu(tsk));
Peter Zijlstra810b3812008-02-29 15:21:01 -05008779
Hiroshi Shimamoto0e1f3482008-03-10 11:01:20 -07008780 if (unlikely(running))
8781 tsk->sched_class->set_curr_task(rq);
8782 if (on_rq)
Peter Zijlstra371fd7e2010-03-24 16:38:48 +01008783 enqueue_task(rq, tsk, 0);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008784
Peter Zijlstra0122ec52011-04-05 17:23:51 +02008785 task_rq_unlock(rq, tsk, &flags);
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008786}
Dhaval Giani7c941432010-01-20 13:26:18 +01008787#endif /* CONFIG_CGROUP_SCHED */
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008788
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01008789#ifdef CONFIG_FAIR_GROUP_SCHED
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008790static DEFINE_MUTEX(shares_mutex);
8791
Ingo Molnar4cf86d72007-10-15 17:00:14 +02008792int sched_group_set_shares(struct task_group *tg, unsigned long shares)
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008793{
8794 int i;
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008795 unsigned long flags;
Ingo Molnarc61935f2008-01-22 11:24:58 +01008796
Peter Zijlstra62fb1852008-02-25 17:34:02 +01008797 /*
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02008798 * We can't change the weight of the root cgroup.
8799 */
8800 if (!tg->se[0])
8801 return -EINVAL;
8802
Mike Galbraithcd622872011-06-04 15:03:20 +02008803 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
Peter Zijlstra62fb1852008-02-25 17:34:02 +01008804
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008805 mutex_lock(&shares_mutex);
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008806 if (tg->shares == shares)
Dhaval Giani5cb350b2007-10-15 17:00:14 +02008807 goto done;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008808
Srivatsa Vaddagiri6b2d7702008-01-25 21:08:00 +01008809 tg->shares = shares;
Peter Zijlstrac09595f2008-06-27 13:41:14 +02008810 for_each_possible_cpu(i) {
Paul Turner94371782010-11-15 15:47:10 -08008811 struct rq *rq = cpu_rq(i);
8812 struct sched_entity *se;
8813
8814 se = tg->se[i];
8815 /* Propagate contribution to hierarchy */
8816 raw_spin_lock_irqsave(&rq->lock, flags);
8817 for_each_sched_entity(se)
Paul Turner6d5ab292011-01-21 20:45:01 -08008818 update_cfs_shares(group_cfs_rq(se));
Paul Turner94371782010-11-15 15:47:10 -08008819 raw_spin_unlock_irqrestore(&rq->lock, flags);
Peter Zijlstrac09595f2008-06-27 13:41:14 +02008820 }
Srivatsa Vaddagiri6b2d7702008-01-25 21:08:00 +01008821
Dhaval Giani5cb350b2007-10-15 17:00:14 +02008822done:
Peter Zijlstra8ed36992008-02-13 15:45:39 +01008823 mutex_unlock(&shares_mutex);
Srivatsa Vaddagiri9b5b7752007-10-15 17:00:09 +02008824 return 0;
Srivatsa Vaddagiri29f59db2007-10-15 17:00:07 +02008825}
8826
Dhaval Giani5cb350b2007-10-15 17:00:14 +02008827unsigned long sched_group_shares(struct task_group *tg)
8828{
8829 return tg->shares;
8830}
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01008831#endif
Dhaval Giani5cb350b2007-10-15 17:00:14 +02008832
Paul Turnera790de92011-07-21 09:43:29 -07008833#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008834static unsigned long to_ratio(u64 period, u64 runtime)
8835{
8836 if (runtime == RUNTIME_INF)
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008837 return 1ULL << 20;
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008838
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008839 return div64_u64(runtime << 20, period);
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008840}
Paul Turnera790de92011-07-21 09:43:29 -07008841#endif
8842
8843#ifdef CONFIG_RT_GROUP_SCHED
8844/*
8845 * Ensure that the real time constraints are schedulable.
8846 */
8847static DEFINE_MUTEX(rt_constraints_mutex);
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008848
Dhaval Giani521f1a242008-02-28 15:21:56 +05308849/* Must be called with tasklist_lock held */
8850static inline int tg_has_rt_tasks(struct task_group *tg)
8851{
8852 struct task_struct *g, *p;
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008853
Dhaval Giani521f1a242008-02-28 15:21:56 +05308854 do_each_thread(g, p) {
8855 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8856 return 1;
8857 } while_each_thread(g, p);
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008858
Dhaval Giani521f1a242008-02-28 15:21:56 +05308859 return 0;
8860}
8861
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008862struct rt_schedulable_data {
8863 struct task_group *tg;
8864 u64 rt_period;
8865 u64 rt_runtime;
8866};
8867
Paul Turnera790de92011-07-21 09:43:29 -07008868static int tg_rt_schedulable(struct task_group *tg, void *data)
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008869{
8870 struct rt_schedulable_data *d = data;
8871 struct task_group *child;
8872 unsigned long total, sum = 0;
8873 u64 period, runtime;
8874
8875 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8876 runtime = tg->rt_bandwidth.rt_runtime;
8877
8878 if (tg == d->tg) {
8879 period = d->rt_period;
8880 runtime = d->rt_runtime;
8881 }
8882
Peter Zijlstra4653f802008-09-23 15:33:44 +02008883 /*
8884 * Cannot have more runtime than the period.
8885 */
8886 if (runtime > period && runtime != RUNTIME_INF)
8887 return -EINVAL;
8888
8889 /*
8890 * Ensure we don't starve existing RT tasks.
8891 */
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008892 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8893 return -EBUSY;
8894
8895 total = to_ratio(period, runtime);
8896
Peter Zijlstra4653f802008-09-23 15:33:44 +02008897 /*
8898 * Nobody can have more than the global setting allows.
8899 */
8900 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8901 return -EINVAL;
8902
8903 /*
8904 * The sum of our children's runtime should not exceed our own.
8905 */
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008906 list_for_each_entry_rcu(child, &tg->children, siblings) {
8907 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8908 runtime = child->rt_bandwidth.rt_runtime;
8909
8910 if (child == d->tg) {
8911 period = d->rt_period;
8912 runtime = d->rt_runtime;
8913 }
8914
8915 sum += to_ratio(period, runtime);
8916 }
8917
8918 if (sum > total)
8919 return -EINVAL;
8920
8921 return 0;
8922}
8923
8924static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8925{
Paul Turner82774342011-07-21 09:43:35 -07008926 int ret;
8927
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008928 struct rt_schedulable_data data = {
8929 .tg = tg,
8930 .rt_period = period,
8931 .rt_runtime = runtime,
8932 };
8933
Paul Turner82774342011-07-21 09:43:35 -07008934 rcu_read_lock();
8935 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8936 rcu_read_unlock();
8937
8938 return ret;
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008939}
8940
Paul Turnerab84d312011-07-21 09:43:28 -07008941static int tg_set_rt_bandwidth(struct task_group *tg,
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008942 u64 rt_period, u64 rt_runtime)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008943{
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008944 int i, err = 0;
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008945
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008946 mutex_lock(&rt_constraints_mutex);
Dhaval Giani521f1a242008-02-28 15:21:56 +05308947 read_lock(&tasklist_lock);
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02008948 err = __rt_schedulable(tg, rt_period, rt_runtime);
8949 if (err)
Dhaval Giani521f1a242008-02-28 15:21:56 +05308950 goto unlock;
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008951
Thomas Gleixner0986b112009-11-17 15:32:06 +01008952 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008953 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
8954 tg->rt_bandwidth.rt_runtime = rt_runtime;
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008955
8956 for_each_possible_cpu(i) {
8957 struct rt_rq *rt_rq = tg->rt_rq[i];
8958
Thomas Gleixner0986b112009-11-17 15:32:06 +01008959 raw_spin_lock(&rt_rq->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008960 rt_rq->rt_runtime = rt_runtime;
Thomas Gleixner0986b112009-11-17 15:32:06 +01008961 raw_spin_unlock(&rt_rq->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02008962 }
Thomas Gleixner0986b112009-11-17 15:32:06 +01008963 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
Peter Zijlstra49246272010-10-17 21:46:10 +02008964unlock:
Dhaval Giani521f1a242008-02-28 15:21:56 +05308965 read_unlock(&tasklist_lock);
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008966 mutex_unlock(&rt_constraints_mutex);
8967
8968 return err;
Peter Zijlstra6f505b12008-01-25 21:08:30 +01008969}
8970
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008971int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8972{
8973 u64 rt_runtime, rt_period;
8974
8975 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8976 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
8977 if (rt_runtime_us < 0)
8978 rt_runtime = RUNTIME_INF;
8979
Paul Turnerab84d312011-07-21 09:43:28 -07008980 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008981}
8982
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008983long sched_group_rt_runtime(struct task_group *tg)
8984{
8985 u64 rt_runtime_us;
8986
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008987 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008988 return -1;
8989
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008990 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01008991 do_div(rt_runtime_us, NSEC_PER_USEC);
8992 return rt_runtime_us;
8993}
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02008994
8995int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8996{
8997 u64 rt_runtime, rt_period;
8998
8999 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
9000 rt_runtime = tg->rt_bandwidth.rt_runtime;
9001
Raistlin619b0482008-06-26 18:54:09 +02009002 if (rt_period == 0)
9003 return -EINVAL;
9004
Paul Turnerab84d312011-07-21 09:43:28 -07009005 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009006}
9007
9008long sched_group_rt_period(struct task_group *tg)
9009{
9010 u64 rt_period_us;
9011
9012 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
9013 do_div(rt_period_us, NSEC_PER_USEC);
9014 return rt_period_us;
9015}
9016
9017static int sched_rt_global_constraints(void)
9018{
Peter Zijlstra4653f802008-09-23 15:33:44 +02009019 u64 runtime, period;
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009020 int ret = 0;
9021
Hiroshi Shimamotoec5d4982008-09-10 17:00:19 -07009022 if (sysctl_sched_rt_period <= 0)
9023 return -EINVAL;
9024
Peter Zijlstra4653f802008-09-23 15:33:44 +02009025 runtime = global_rt_runtime();
9026 period = global_rt_period();
9027
9028 /*
9029 * Sanity check on the sysctl variables.
9030 */
9031 if (runtime > period && runtime != RUNTIME_INF)
9032 return -EINVAL;
Peter Zijlstra10b612f2008-06-19 14:22:27 +02009033
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009034 mutex_lock(&rt_constraints_mutex);
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02009035 read_lock(&tasklist_lock);
Peter Zijlstra4653f802008-09-23 15:33:44 +02009036 ret = __rt_schedulable(NULL, 0, 0);
Peter Zijlstra9a7e0b12008-08-19 12:33:06 +02009037 read_unlock(&tasklist_lock);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009038 mutex_unlock(&rt_constraints_mutex);
9039
9040 return ret;
9041}
Dhaval Giani54e99122009-02-27 15:13:54 +05309042
9043int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9044{
9045 /* Don't accept realtime tasks when there is no way for them to run */
9046 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9047 return 0;
9048
9049 return 1;
9050}
9051
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02009052#else /* !CONFIG_RT_GROUP_SCHED */
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009053static int sched_rt_global_constraints(void)
9054{
Peter Zijlstraac086bc2008-04-19 19:44:58 +02009055 unsigned long flags;
9056 int i;
9057
Hiroshi Shimamotoec5d4982008-09-10 17:00:19 -07009058 if (sysctl_sched_rt_period <= 0)
9059 return -EINVAL;
9060
Peter Zijlstra60aa6052009-05-05 17:50:21 +02009061 /*
9062 * There's always some RT tasks in the root group
9063 * -- migration, kstopmachine etc..
9064 */
9065 if (sysctl_sched_rt_runtime == 0)
9066 return -EBUSY;
9067
Thomas Gleixner0986b112009-11-17 15:32:06 +01009068 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02009069 for_each_possible_cpu(i) {
9070 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
9071
Thomas Gleixner0986b112009-11-17 15:32:06 +01009072 raw_spin_lock(&rt_rq->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02009073 rt_rq->rt_runtime = global_rt_runtime();
Thomas Gleixner0986b112009-11-17 15:32:06 +01009074 raw_spin_unlock(&rt_rq->rt_runtime_lock);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02009075 }
Thomas Gleixner0986b112009-11-17 15:32:06 +01009076 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
Peter Zijlstraac086bc2008-04-19 19:44:58 +02009077
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009078 return 0;
9079}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02009080#endif /* CONFIG_RT_GROUP_SCHED */
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009081
9082int sched_rt_handler(struct ctl_table *table, int write,
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07009083 void __user *buffer, size_t *lenp,
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009084 loff_t *ppos)
9085{
9086 int ret;
9087 int old_period, old_runtime;
9088 static DEFINE_MUTEX(mutex);
9089
9090 mutex_lock(&mutex);
9091 old_period = sysctl_sched_rt_period;
9092 old_runtime = sysctl_sched_rt_runtime;
9093
Alexey Dobriyan8d65af72009-09-23 15:57:19 -07009094 ret = proc_dointvec(table, write, buffer, lenp, ppos);
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009095
9096 if (!ret && write) {
9097 ret = sched_rt_global_constraints();
9098 if (ret) {
9099 sysctl_sched_rt_period = old_period;
9100 sysctl_sched_rt_runtime = old_runtime;
9101 } else {
9102 def_rt_bandwidth.rt_runtime = global_rt_runtime();
9103 def_rt_bandwidth.rt_period =
9104 ns_to_ktime(global_rt_period());
9105 }
9106 }
9107 mutex_unlock(&mutex);
9108
9109 return ret;
9110}
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009111
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009112#ifdef CONFIG_CGROUP_SCHED
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009113
9114/* return corresponding task_group object of a cgroup */
Paul Menage2b01dfe2007-10-24 18:23:50 +02009115static inline struct task_group *cgroup_tg(struct cgroup *cgrp)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009116{
Paul Menage2b01dfe2007-10-24 18:23:50 +02009117 return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id),
9118 struct task_group, css);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009119}
9120
9121static struct cgroup_subsys_state *
Paul Menage2b01dfe2007-10-24 18:23:50 +02009122cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009123{
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02009124 struct task_group *tg, *parent;
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009125
Paul Menage2b01dfe2007-10-24 18:23:50 +02009126 if (!cgrp->parent) {
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009127 /* This is early initialization for the top cgroup */
Yong Zhang07e06b02011-01-07 15:17:36 +08009128 return &root_task_group.css;
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009129 }
9130
Dhaval Gianiec7dc8a2008-04-19 19:44:59 +02009131 parent = cgroup_tg(cgrp->parent);
9132 tg = sched_create_group(parent);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009133 if (IS_ERR(tg))
9134 return ERR_PTR(-ENOMEM);
9135
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009136 return &tg->css;
9137}
9138
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01009139static void
9140cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009141{
Paul Menage2b01dfe2007-10-24 18:23:50 +02009142 struct task_group *tg = cgroup_tg(cgrp);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009143
9144 sched_destroy_group(tg);
9145}
9146
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01009147static int
Ben Blumbe367d02009-09-23 15:56:31 -07009148cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009149{
Peter Zijlstrab68aa232008-02-13 15:45:40 +01009150#ifdef CONFIG_RT_GROUP_SCHED
Dhaval Giani54e99122009-02-27 15:13:54 +05309151 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
Peter Zijlstrab68aa232008-02-13 15:45:40 +01009152 return -EINVAL;
9153#else
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009154 /* We don't support RT-tasks being in separate groups */
9155 if (tsk->sched_class != &fair_sched_class)
9156 return -EINVAL;
Peter Zijlstrab68aa232008-02-13 15:45:40 +01009157#endif
Ben Blumbe367d02009-09-23 15:56:31 -07009158 return 0;
9159}
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009160
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009161static void
Ben Blumf780bdb2011-05-26 16:25:19 -07009162cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009163{
9164 sched_move_task(tsk);
9165}
9166
Peter Zijlstra068c5cc2011-01-19 12:26:11 +01009167static void
Peter Zijlstrad41d5a02011-02-07 17:02:20 +01009168cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9169 struct cgroup *old_cgrp, struct task_struct *task)
Peter Zijlstra068c5cc2011-01-19 12:26:11 +01009170{
9171 /*
9172 * cgroup_exit() is called in the copy_process() failure path.
9173 * Ignore this case since the task hasn't ran yet, this avoids
9174 * trying to poke a half freed task state from generic code.
9175 */
9176 if (!(task->flags & PF_EXITING))
9177 return;
9178
9179 sched_move_task(task);
9180}
9181
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009182#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Menagef4c753b2008-04-29 00:59:56 -07009183static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
Paul Menage2b01dfe2007-10-24 18:23:50 +02009184 u64 shareval)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009185{
Nikhil Raoc8b28112011-05-18 14:37:48 -07009186 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009187}
9188
Paul Menagef4c753b2008-04-29 00:59:56 -07009189static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009190{
Paul Menage2b01dfe2007-10-24 18:23:50 +02009191 struct task_group *tg = cgroup_tg(cgrp);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009192
Nikhil Raoc8b28112011-05-18 14:37:48 -07009193 return (u64) scale_load_down(tg->shares);
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009194}
Paul Turnerab84d312011-07-21 09:43:28 -07009195
9196#ifdef CONFIG_CFS_BANDWIDTH
Paul Turnera790de92011-07-21 09:43:29 -07009197static DEFINE_MUTEX(cfs_constraints_mutex);
9198
Paul Turnerab84d312011-07-21 09:43:28 -07009199const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9200const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9201
Paul Turnera790de92011-07-21 09:43:29 -07009202static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9203
Paul Turnerab84d312011-07-21 09:43:28 -07009204static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9205{
Paul Turner58088ad2011-07-21 09:43:31 -07009206 int i, ret = 0, runtime_enabled;
Paul Turnerab84d312011-07-21 09:43:28 -07009207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
Paul Turnerab84d312011-07-21 09:43:28 -07009208
9209 if (tg == &root_task_group)
9210 return -EINVAL;
9211
9212 /*
9213 * Ensure we have at some amount of bandwidth every period. This is
9214 * to prevent reaching a state of large arrears when throttled via
9215 * entity_tick() resulting in prolonged exit starvation.
9216 */
9217 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9218 return -EINVAL;
9219
9220 /*
9221 * Likewise, bound things on the otherside by preventing insane quota
9222 * periods. This also allows us to normalize in computing quota
9223 * feasibility.
9224 */
9225 if (period > max_cfs_quota_period)
9226 return -EINVAL;
9227
Paul Turnera790de92011-07-21 09:43:29 -07009228 mutex_lock(&cfs_constraints_mutex);
9229 ret = __cfs_schedulable(tg, period, quota);
9230 if (ret)
9231 goto out_unlock;
9232
Paul Turner58088ad2011-07-21 09:43:31 -07009233 runtime_enabled = quota != RUNTIME_INF;
Paul Turnerab84d312011-07-21 09:43:28 -07009234 raw_spin_lock_irq(&cfs_b->lock);
9235 cfs_b->period = ns_to_ktime(period);
9236 cfs_b->quota = quota;
Paul Turner58088ad2011-07-21 09:43:31 -07009237
Paul Turnera9cf55b2011-07-21 09:43:32 -07009238 __refill_cfs_bandwidth_runtime(cfs_b);
Paul Turner58088ad2011-07-21 09:43:31 -07009239 /* restart the period timer (if active) to handle new period expiry */
9240 if (runtime_enabled && cfs_b->timer_active) {
9241 /* force a reprogram */
9242 cfs_b->timer_active = 0;
9243 __start_cfs_bandwidth(cfs_b);
9244 }
Paul Turnerab84d312011-07-21 09:43:28 -07009245 raw_spin_unlock_irq(&cfs_b->lock);
9246
9247 for_each_possible_cpu(i) {
9248 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9249 struct rq *rq = rq_of(cfs_rq);
9250
9251 raw_spin_lock_irq(&rq->lock);
Paul Turner58088ad2011-07-21 09:43:31 -07009252 cfs_rq->runtime_enabled = runtime_enabled;
Paul Turnerab84d312011-07-21 09:43:28 -07009253 cfs_rq->runtime_remaining = 0;
Paul Turner671fd9d2011-07-21 09:43:34 -07009254
9255 if (cfs_rq_throttled(cfs_rq))
9256 unthrottle_cfs_rq(cfs_rq);
Paul Turnerab84d312011-07-21 09:43:28 -07009257 raw_spin_unlock_irq(&rq->lock);
9258 }
Paul Turnera790de92011-07-21 09:43:29 -07009259out_unlock:
9260 mutex_unlock(&cfs_constraints_mutex);
Paul Turnerab84d312011-07-21 09:43:28 -07009261
Paul Turnera790de92011-07-21 09:43:29 -07009262 return ret;
Paul Turnerab84d312011-07-21 09:43:28 -07009263}
9264
9265int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9266{
9267 u64 quota, period;
9268
9269 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9270 if (cfs_quota_us < 0)
9271 quota = RUNTIME_INF;
9272 else
9273 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9274
9275 return tg_set_cfs_bandwidth(tg, period, quota);
9276}
9277
9278long tg_get_cfs_quota(struct task_group *tg)
9279{
9280 u64 quota_us;
9281
9282 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9283 return -1;
9284
9285 quota_us = tg_cfs_bandwidth(tg)->quota;
9286 do_div(quota_us, NSEC_PER_USEC);
9287
9288 return quota_us;
9289}
9290
9291int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9292{
9293 u64 quota, period;
9294
9295 period = (u64)cfs_period_us * NSEC_PER_USEC;
9296 quota = tg_cfs_bandwidth(tg)->quota;
9297
9298 if (period <= 0)
9299 return -EINVAL;
9300
9301 return tg_set_cfs_bandwidth(tg, period, quota);
9302}
9303
9304long tg_get_cfs_period(struct task_group *tg)
9305{
9306 u64 cfs_period_us;
9307
9308 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9309 do_div(cfs_period_us, NSEC_PER_USEC);
9310
9311 return cfs_period_us;
9312}
9313
9314static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9315{
9316 return tg_get_cfs_quota(cgroup_tg(cgrp));
9317}
9318
9319static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9320 s64 cfs_quota_us)
9321{
9322 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9323}
9324
9325static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9326{
9327 return tg_get_cfs_period(cgroup_tg(cgrp));
9328}
9329
9330static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9331 u64 cfs_period_us)
9332{
9333 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9334}
9335
Paul Turnera790de92011-07-21 09:43:29 -07009336struct cfs_schedulable_data {
9337 struct task_group *tg;
9338 u64 period, quota;
9339};
9340
9341/*
9342 * normalize group quota/period to be quota/max_period
9343 * note: units are usecs
9344 */
9345static u64 normalize_cfs_quota(struct task_group *tg,
9346 struct cfs_schedulable_data *d)
9347{
9348 u64 quota, period;
9349
9350 if (tg == d->tg) {
9351 period = d->period;
9352 quota = d->quota;
9353 } else {
9354 period = tg_get_cfs_period(tg);
9355 quota = tg_get_cfs_quota(tg);
9356 }
9357
9358 /* note: these should typically be equivalent */
9359 if (quota == RUNTIME_INF || quota == -1)
9360 return RUNTIME_INF;
9361
9362 return to_ratio(period, quota);
9363}
9364
9365static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9366{
9367 struct cfs_schedulable_data *d = data;
9368 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9369 s64 quota = 0, parent_quota = -1;
9370
9371 if (!tg->parent) {
9372 quota = RUNTIME_INF;
9373 } else {
9374 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9375
9376 quota = normalize_cfs_quota(tg, d);
9377 parent_quota = parent_b->hierarchal_quota;
9378
9379 /*
9380 * ensure max(child_quota) <= parent_quota, inherit when no
9381 * limit is set
9382 */
9383 if (quota == RUNTIME_INF)
9384 quota = parent_quota;
9385 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9386 return -EINVAL;
9387 }
9388 cfs_b->hierarchal_quota = quota;
9389
9390 return 0;
9391}
9392
9393static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9394{
Paul Turner82774342011-07-21 09:43:35 -07009395 int ret;
Paul Turnera790de92011-07-21 09:43:29 -07009396 struct cfs_schedulable_data data = {
9397 .tg = tg,
9398 .period = period,
9399 .quota = quota,
9400 };
9401
9402 if (quota != RUNTIME_INF) {
9403 do_div(data.period, NSEC_PER_USEC);
9404 do_div(data.quota, NSEC_PER_USEC);
9405 }
9406
Paul Turner82774342011-07-21 09:43:35 -07009407 rcu_read_lock();
9408 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9409 rcu_read_unlock();
9410
9411 return ret;
Paul Turnera790de92011-07-21 09:43:29 -07009412}
Nikhil Raoe8da1b12011-07-21 09:43:40 -07009413
9414static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9415 struct cgroup_map_cb *cb)
9416{
9417 struct task_group *tg = cgroup_tg(cgrp);
9418 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9419
9420 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9421 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9422 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9423
9424 return 0;
9425}
Paul Turnerab84d312011-07-21 09:43:28 -07009426#endif /* CONFIG_CFS_BANDWIDTH */
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02009427#endif /* CONFIG_FAIR_GROUP_SCHED */
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009428
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009429#ifdef CONFIG_RT_GROUP_SCHED
Mirco Tischler0c708142008-05-14 16:05:46 -07009430static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
Paul Menage06ecb272008-04-29 01:00:06 -07009431 s64 val)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009432{
Paul Menage06ecb272008-04-29 01:00:06 -07009433 return sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009434}
9435
Paul Menage06ecb272008-04-29 01:00:06 -07009436static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft)
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009437{
Paul Menage06ecb272008-04-29 01:00:06 -07009438 return sched_group_rt_runtime(cgroup_tg(cgrp));
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009439}
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009440
9441static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype,
9442 u64 rt_period_us)
9443{
9444 return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us);
9445}
9446
9447static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9448{
9449 return sched_group_rt_period(cgroup_tg(cgrp));
9450}
Dhaval Giani6d6bc0a2008-05-30 14:23:45 +02009451#endif /* CONFIG_RT_GROUP_SCHED */
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009452
Paul Menagefe5c7cc2007-10-29 21:18:11 +01009453static struct cftype cpu_files[] = {
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009454#ifdef CONFIG_FAIR_GROUP_SCHED
Paul Menagefe5c7cc2007-10-29 21:18:11 +01009455 {
9456 .name = "shares",
Paul Menagef4c753b2008-04-29 00:59:56 -07009457 .read_u64 = cpu_shares_read_u64,
9458 .write_u64 = cpu_shares_write_u64,
Paul Menagefe5c7cc2007-10-29 21:18:11 +01009459 },
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009460#endif
Paul Turnerab84d312011-07-21 09:43:28 -07009461#ifdef CONFIG_CFS_BANDWIDTH
9462 {
9463 .name = "cfs_quota_us",
9464 .read_s64 = cpu_cfs_quota_read_s64,
9465 .write_s64 = cpu_cfs_quota_write_s64,
9466 },
9467 {
9468 .name = "cfs_period_us",
9469 .read_u64 = cpu_cfs_period_read_u64,
9470 .write_u64 = cpu_cfs_period_write_u64,
9471 },
Nikhil Raoe8da1b12011-07-21 09:43:40 -07009472 {
9473 .name = "stat",
9474 .read_map = cpu_stats_show,
9475 },
Paul Turnerab84d312011-07-21 09:43:28 -07009476#endif
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009477#ifdef CONFIG_RT_GROUP_SCHED
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009478 {
Peter Zijlstra9f0c1e52008-02-13 15:45:39 +01009479 .name = "rt_runtime_us",
Paul Menage06ecb272008-04-29 01:00:06 -07009480 .read_s64 = cpu_rt_runtime_read,
9481 .write_s64 = cpu_rt_runtime_write,
Peter Zijlstra6f505b12008-01-25 21:08:30 +01009482 },
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009483 {
9484 .name = "rt_period_us",
Paul Menagef4c753b2008-04-29 00:59:56 -07009485 .read_u64 = cpu_rt_period_read_uint,
9486 .write_u64 = cpu_rt_period_write_uint,
Peter Zijlstrad0b27fa2008-04-19 19:44:57 +02009487 },
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009488#endif
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009489};
9490
9491static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
9492{
Paul Menagefe5c7cc2007-10-29 21:18:11 +01009493 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009494}
9495
9496struct cgroup_subsys cpu_cgroup_subsys = {
Ingo Molnar38605ca2007-10-29 21:18:11 +01009497 .name = "cpu",
9498 .create = cpu_cgroup_create,
9499 .destroy = cpu_cgroup_destroy,
Ben Blumf780bdb2011-05-26 16:25:19 -07009500 .can_attach_task = cpu_cgroup_can_attach_task,
9501 .attach_task = cpu_cgroup_attach_task,
Peter Zijlstra068c5cc2011-01-19 12:26:11 +01009502 .exit = cpu_cgroup_exit,
Ingo Molnar38605ca2007-10-29 21:18:11 +01009503 .populate = cpu_cgroup_populate,
9504 .subsys_id = cpu_cgroup_subsys_id,
Srivatsa Vaddagiri68318b82007-10-18 23:41:03 -07009505 .early_init = 1,
9506};
9507
Peter Zijlstra052f1dc2008-02-13 15:45:40 +01009508#endif /* CONFIG_CGROUP_SCHED */
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009509
9510#ifdef CONFIG_CGROUP_CPUACCT
9511
9512/*
9513 * CPU accounting code for task groups.
9514 *
9515 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
9516 * (balbir@in.ibm.com).
9517 */
9518
Bharata B Rao934352f2008-11-10 20:41:13 +05309519/* track cpu usage of a group of tasks and its child groups */
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009520struct cpuacct {
9521 struct cgroup_subsys_state css;
9522 /* cpuusage holds pointer to a u64-type object on every cpu */
Tejun Heo43cf38e2010-02-02 14:38:57 +09009523 u64 __percpu *cpuusage;
Bharata B Raoef12fef2009-03-31 10:02:22 +05309524 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
Bharata B Rao934352f2008-11-10 20:41:13 +05309525 struct cpuacct *parent;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009526};
9527
9528struct cgroup_subsys cpuacct_subsys;
9529
9530/* return cpu accounting group corresponding to this container */
Dhaval Giani32cd7562008-02-29 10:02:43 +05309531static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009532{
Dhaval Giani32cd7562008-02-29 10:02:43 +05309533 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009534 struct cpuacct, css);
9535}
9536
9537/* return cpu accounting group to which this task belongs */
9538static inline struct cpuacct *task_ca(struct task_struct *tsk)
9539{
9540 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9541 struct cpuacct, css);
9542}
9543
9544/* create a new cpu accounting group */
9545static struct cgroup_subsys_state *cpuacct_create(
Dhaval Giani32cd7562008-02-29 10:02:43 +05309546 struct cgroup_subsys *ss, struct cgroup *cgrp)
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009547{
9548 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
Bharata B Raoef12fef2009-03-31 10:02:22 +05309549 int i;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009550
9551 if (!ca)
Bharata B Raoef12fef2009-03-31 10:02:22 +05309552 goto out;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009553
9554 ca->cpuusage = alloc_percpu(u64);
Bharata B Raoef12fef2009-03-31 10:02:22 +05309555 if (!ca->cpuusage)
9556 goto out_free_ca;
9557
9558 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9559 if (percpu_counter_init(&ca->cpustat[i], 0))
9560 goto out_free_counters;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009561
Bharata B Rao934352f2008-11-10 20:41:13 +05309562 if (cgrp->parent)
9563 ca->parent = cgroup_ca(cgrp->parent);
9564
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009565 return &ca->css;
Bharata B Raoef12fef2009-03-31 10:02:22 +05309566
9567out_free_counters:
9568 while (--i >= 0)
9569 percpu_counter_destroy(&ca->cpustat[i]);
9570 free_percpu(ca->cpuusage);
9571out_free_ca:
9572 kfree(ca);
9573out:
9574 return ERR_PTR(-ENOMEM);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009575}
9576
9577/* destroy an existing cpu accounting group */
Ingo Molnar41a2d6c2007-12-05 15:46:09 +01009578static void
Dhaval Giani32cd7562008-02-29 10:02:43 +05309579cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009580{
Dhaval Giani32cd7562008-02-29 10:02:43 +05309581 struct cpuacct *ca = cgroup_ca(cgrp);
Bharata B Raoef12fef2009-03-31 10:02:22 +05309582 int i;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009583
Bharata B Raoef12fef2009-03-31 10:02:22 +05309584 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
9585 percpu_counter_destroy(&ca->cpustat[i]);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009586 free_percpu(ca->cpuusage);
9587 kfree(ca);
9588}
9589
Ken Chen720f5492008-12-15 22:02:01 -08009590static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9591{
Rusty Russellb36128c2009-02-20 16:29:08 +09009592 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
Ken Chen720f5492008-12-15 22:02:01 -08009593 u64 data;
9594
9595#ifndef CONFIG_64BIT
9596 /*
9597 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
9598 */
Thomas Gleixner05fa7852009-11-17 14:28:38 +01009599 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
Ken Chen720f5492008-12-15 22:02:01 -08009600 data = *cpuusage;
Thomas Gleixner05fa7852009-11-17 14:28:38 +01009601 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
Ken Chen720f5492008-12-15 22:02:01 -08009602#else
9603 data = *cpuusage;
9604#endif
9605
9606 return data;
9607}
9608
9609static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9610{
Rusty Russellb36128c2009-02-20 16:29:08 +09009611 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
Ken Chen720f5492008-12-15 22:02:01 -08009612
9613#ifndef CONFIG_64BIT
9614 /*
9615 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
9616 */
Thomas Gleixner05fa7852009-11-17 14:28:38 +01009617 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
Ken Chen720f5492008-12-15 22:02:01 -08009618 *cpuusage = val;
Thomas Gleixner05fa7852009-11-17 14:28:38 +01009619 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
Ken Chen720f5492008-12-15 22:02:01 -08009620#else
9621 *cpuusage = val;
9622#endif
9623}
9624
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009625/* return total cpu usage (in nanoseconds) of a group */
Dhaval Giani32cd7562008-02-29 10:02:43 +05309626static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009627{
Dhaval Giani32cd7562008-02-29 10:02:43 +05309628 struct cpuacct *ca = cgroup_ca(cgrp);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009629 u64 totalcpuusage = 0;
9630 int i;
9631
Ken Chen720f5492008-12-15 22:02:01 -08009632 for_each_present_cpu(i)
9633 totalcpuusage += cpuacct_cpuusage_read(ca, i);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009634
9635 return totalcpuusage;
9636}
9637
Dhaval Giani0297b802008-02-29 10:02:44 +05309638static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
9639 u64 reset)
9640{
9641 struct cpuacct *ca = cgroup_ca(cgrp);
9642 int err = 0;
9643 int i;
9644
9645 if (reset) {
9646 err = -EINVAL;
9647 goto out;
9648 }
9649
Ken Chen720f5492008-12-15 22:02:01 -08009650 for_each_present_cpu(i)
9651 cpuacct_cpuusage_write(ca, i, 0);
Dhaval Giani0297b802008-02-29 10:02:44 +05309652
Dhaval Giani0297b802008-02-29 10:02:44 +05309653out:
9654 return err;
9655}
9656
Ken Chene9515c32008-12-15 22:04:15 -08009657static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
9658 struct seq_file *m)
9659{
9660 struct cpuacct *ca = cgroup_ca(cgroup);
9661 u64 percpu;
9662 int i;
9663
9664 for_each_present_cpu(i) {
9665 percpu = cpuacct_cpuusage_read(ca, i);
9666 seq_printf(m, "%llu ", (unsigned long long) percpu);
9667 }
9668 seq_printf(m, "\n");
9669 return 0;
9670}
9671
Bharata B Raoef12fef2009-03-31 10:02:22 +05309672static const char *cpuacct_stat_desc[] = {
9673 [CPUACCT_STAT_USER] = "user",
9674 [CPUACCT_STAT_SYSTEM] = "system",
9675};
9676
9677static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9678 struct cgroup_map_cb *cb)
9679{
9680 struct cpuacct *ca = cgroup_ca(cgrp);
9681 int i;
9682
9683 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
9684 s64 val = percpu_counter_read(&ca->cpustat[i]);
9685 val = cputime64_to_clock_t(val);
9686 cb->fill(cb, cpuacct_stat_desc[i], val);
9687 }
9688 return 0;
9689}
9690
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009691static struct cftype files[] = {
9692 {
9693 .name = "usage",
Paul Menagef4c753b2008-04-29 00:59:56 -07009694 .read_u64 = cpuusage_read,
9695 .write_u64 = cpuusage_write,
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009696 },
Ken Chene9515c32008-12-15 22:04:15 -08009697 {
9698 .name = "usage_percpu",
9699 .read_seq_string = cpuacct_percpu_seq_read,
9700 },
Bharata B Raoef12fef2009-03-31 10:02:22 +05309701 {
9702 .name = "stat",
9703 .read_map = cpuacct_stats_show,
9704 },
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009705};
9706
Dhaval Giani32cd7562008-02-29 10:02:43 +05309707static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009708{
Dhaval Giani32cd7562008-02-29 10:02:43 +05309709 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009710}
9711
9712/*
9713 * charge this task's execution time to its accounting group.
9714 *
9715 * called with rq->lock held.
9716 */
9717static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9718{
9719 struct cpuacct *ca;
Bharata B Rao934352f2008-11-10 20:41:13 +05309720 int cpu;
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009721
Li Zefanc40c6f82009-02-26 15:40:15 +08009722 if (unlikely(!cpuacct_subsys.active))
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009723 return;
9724
Bharata B Rao934352f2008-11-10 20:41:13 +05309725 cpu = task_cpu(tsk);
Bharata B Raoa18b83b2009-03-23 10:02:53 +05309726
9727 rcu_read_lock();
9728
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009729 ca = task_ca(tsk);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009730
Bharata B Rao934352f2008-11-10 20:41:13 +05309731 for (; ca; ca = ca->parent) {
Rusty Russellb36128c2009-02-20 16:29:08 +09009732 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009733 *cpuusage += cputime;
9734 }
Bharata B Raoa18b83b2009-03-23 10:02:53 +05309735
9736 rcu_read_unlock();
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009737}
9738
Bharata B Raoef12fef2009-03-31 10:02:22 +05309739/*
Anton Blanchardfa535a72010-02-02 14:46:13 -08009740 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9741 * in cputime_t units. As a result, cpuacct_update_stats calls
9742 * percpu_counter_add with values large enough to always overflow the
9743 * per cpu batch limit causing bad SMP scalability.
9744 *
9745 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9746 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9747 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9748 */
9749#ifdef CONFIG_SMP
9750#define CPUACCT_BATCH \
9751 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9752#else
9753#define CPUACCT_BATCH 0
9754#endif
9755
9756/*
Bharata B Raoef12fef2009-03-31 10:02:22 +05309757 * Charge the system/user time to the task's accounting group.
9758 */
9759static void cpuacct_update_stats(struct task_struct *tsk,
9760 enum cpuacct_stat_index idx, cputime_t val)
9761{
9762 struct cpuacct *ca;
Anton Blanchardfa535a72010-02-02 14:46:13 -08009763 int batch = CPUACCT_BATCH;
Bharata B Raoef12fef2009-03-31 10:02:22 +05309764
9765 if (unlikely(!cpuacct_subsys.active))
9766 return;
9767
9768 rcu_read_lock();
9769 ca = task_ca(tsk);
9770
9771 do {
Anton Blanchardfa535a72010-02-02 14:46:13 -08009772 __percpu_counter_add(&ca->cpustat[idx], val, batch);
Bharata B Raoef12fef2009-03-31 10:02:22 +05309773 ca = ca->parent;
9774 } while (ca);
9775 rcu_read_unlock();
9776}
9777
Srivatsa Vaddagirid842de82007-12-02 20:04:49 +01009778struct cgroup_subsys cpuacct_subsys = {
9779 .name = "cpuacct",
9780 .create = cpuacct_create,
9781 .destroy = cpuacct_destroy,
9782 .populate = cpuacct_populate,
9783 .subsys_id = cpuacct_subsys_id,
9784};
9785#endif /* CONFIG_CGROUP_CPUACCT */