/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15	Work begun on replacing all interactivity tuning with a
 *		fair scheduling design by Con Kolivas.
 *  2007-05-05	Load balancing (smp-nice) and other improvements
 *		by Peter Williams
 *  2007-05-06	Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01	Group scheduling enhancements by Srivatsa Vaddagiri
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <linux/unistd.h>

#include <asm/tlb.h>

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is default implementation.
 * Architectures and sub-architectures can override this.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
	return (unsigned long long)jiffies * (1000000000 / HZ);
}
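/*
 * Illustrative note (not in the original source): this jiffies-based
 * fallback only advances once per tick, i.e. in steps of
 * 1000000000 / HZ nanoseconds - 4,000,000 ns per step on a HZ=250
 * kernel, for example - which is why architectures with a fine-grained
 * cycle counter are expected to override it.
 */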

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
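/*
 * Worked example (illustrative, assuming the usual MAX_RT_PRIO == 100):
 * NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120 and
 * NICE_TO_PRIO(19) == 139, so the whole nice range maps onto the static
 * priorities just above the RT range, and PRIO_TO_NICE() undoes the
 * mapping exactly.
 */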

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME)	((TIME) * (1000000000 / HZ))

#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
 * default timeslice is 100 msecs, maximum timeslice is 800 msecs.
 * Timeslices get refilled after they expire.
 */
#define MIN_TIMESLICE		max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE		(100 * HZ / 1000)
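/*
 * Illustrative numbers (not in the original source): with HZ=1000 these
 * evaluate to MIN_TIMESLICE == 5 jiffies (5 ms) and DEF_TIMESLICE == 100
 * jiffies (100 ms); with HZ=100 the 5*HZ/1000 term truncates to 0 and
 * MIN_TIMESLICE falls back to the 1-jiffy (10 ms) floor enforced by max().
 */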

#ifdef CONFIG_SMP
/*
 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
 * Since cpu_power is a 'constant', we can use a reciprocal divide.
 */
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
	return reciprocal_divide(load, sg->reciprocal_cpu_power);
}

/*
 * Each time a sched group cpu_power is changed,
 * we must compute its reciprocal value
 */
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
	sg->__cpu_power += val;
	sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif
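/*
 * Background note (paraphrased, not from the original source): the
 * reciprocal_div.h helpers replace "load / cpu_power" with a multiply
 * and shift.  reciprocal_value(k) precomputes roughly 2^32 / k once,
 * and reciprocal_divide(load, r) then evaluates (u64)load * r >> 32,
 * which is much cheaper than a division in the load-balancing hot path.
 */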

#define SCALE_PRIO(x, prio) \
	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

/*
 * static_prio_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
 * to time slice values: [800ms ... 100ms ... 5ms]
 */
static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio == NICE_TO_PRIO(19))
		return 1;

	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	else
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
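/*
 * Worked example (illustrative, assuming MAX_PRIO == 140,
 * MAX_USER_PRIO == 40 and HZ == 1000): a nice -20 task has static_prio
 * 100 and gets SCALE_PRIO(400, 100) == 400 * 40 / 20 == 800 jiffies
 * (800 ms); a nice 0 task gets SCALE_PRIO(100, 120) == 100 * 20 / 20 ==
 * 100 jiffies (100 ms); nice 19 is special-cased to a single jiffy.
 */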

static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

struct load_stat {
	struct load_weight load;
	u64 load_update_start, load_update_last;
	unsigned long delta_fair, delta_exec, delta_stat;
};

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	s64 fair_clock;
	u64 exec_clock;
	s64 wait_runtime;
	u64 sleeper_bonus;
	unsigned long wait_runtime_overruns, wait_runtime_underruns;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;
	struct rb_node *rb_load_balance_curr;
#ifdef CONFIG_FAIR_GROUP_SCHED
	/* 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e when none are currently running).
	 */
	struct sched_entity *curr;
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */
#endif
};
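/*
 * Illustrative note (not in the original source): tasks_timeline is the
 * red-black tree of runnable entities, ordered by their position on the
 * fair clock, and rb_leftmost caches the leftmost node so the next task
 * to run can be picked without walking down the tree on every schedule().
 */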

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	int rt_load_balance_idx;
	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
};

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct rq {
	spinlock_t lock;	/* runqueue lock */

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
	unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
	unsigned char in_nohz_recently;
#endif
	struct load_stat ls;	/* capture load from *all* tasks on this cpu */
	unsigned long nr_load_updates;
	u64 nr_switches;

	struct cfs_rq cfs;
#ifdef CONFIG_FAIR_GROUP_SCHED
	struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */
#endif
	struct rt_rq rt;

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock, prev_clock_raw;
	s64 clock_max_delta;

	unsigned int clock_warps, clock_overflows;
	unsigned int clock_unstable_events;

	struct sched_class *load_balance_class;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct sched_domain *sd;

	/* For active balancing */
	int active_balance;
	int push_cpu;
	int cpu;		/* cpu of this runqueue */

	struct task_struct *migration_thread;
	struct list_head migration_queue;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;

	/* sys_sched_yield() stats */
	unsigned long yld_exp_empty;
	unsigned long yld_act_empty;
	unsigned long yld_both_empty;
	unsigned long yld_cnt;

	/* schedule() stats */
	unsigned long sched_switch;
	unsigned long sched_cnt;
	unsigned long sched_goidle;

	/* try_to_wake_up() stats */
	unsigned long ttwu_cnt;
	unsigned long ttwu_local;
#endif
	struct lock_class_key rq_lock_key;
};
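/*
 * Illustrative note on the locking rule above (not in the original
 * source): when two runqueues must be held at once, the one at the
 * lower address is taken first, e.g.
 *
 *	if (rq1 < rq2) {
 *		spin_lock(&rq1->lock);
 *		spin_lock(&rq2->lock);
 *	} else {
 *		spin_lock(&rq2->lock);
 *		spin_lock(&rq1->lock);
 *	}
 *
 * which is the sort of ordering the double-runqueue locking helpers
 * later in this file rely on to avoid AB-BA deadlocks.
 */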

static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);

static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
{
	rq->curr->sched_class->check_preempt_curr(rq, p);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

/*
 * Per-runqueue clock, as finegrained as the platform can give us:
 */
static unsigned long long __rq_clock(struct rq *rq)
{
	u64 prev_raw = rq->prev_clock_raw;
	u64 now = sched_clock();
	s64 delta = now - prev_raw;
	u64 clock = rq->clock;

	/*
	 * Protect against sched_clock() occasionally going backwards:
	 */
	if (unlikely(delta < 0)) {
		clock++;
		rq->clock_warps++;
	} else {
		/*
		 * Catch too large forward jumps too:
		 */
		if (unlikely(delta > 2*TICK_NSEC)) {
			clock++;
			rq->clock_overflows++;
		} else {
			if (unlikely(delta > rq->clock_max_delta))
				rq->clock_max_delta = delta;
			clock += delta;
		}
	}

	rq->prev_clock_raw = now;
	rq->clock = clock;

	return clock;
}

static inline unsigned long long rq_clock(struct rq *rq)
{
	int this_cpu = smp_processor_id();

	if (this_cpu == cpu_of(rq))
		return __rq_clock(rq);

	return rq->clock;
}
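/*
 * Worked example (illustrative, not from the original source): with
 * HZ=1000, TICK_NSEC is about 1,000,000 ns.  If sched_clock() reads
 * 500,000 ns ahead of prev_clock_raw, rq->clock advances by the full
 * 500,000 ns; if it jumps 10,000,000 ns ahead (e.g. after a TSC glitch)
 * or goes backwards, the jump is discarded and rq->clock only ticks
 * forward by 1 ns, with clock_overflows/clock_warps counting how often
 * that happened.
 */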

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Change a task's ->cfs_rq if it moves across CPUs */
static inline void set_task_cfs_rq(struct task_struct *p)
{
	p->se.cfs_rq = &task_rq(p)->cfs;
}
#else
static inline void set_task_cfs_rq(struct task_struct *p)
{
}
#endif

Linus Torvalds1da177e2005-04-16 15:20:36 -0700394#ifndef prepare_arch_switch
Nick Piggin4866cde2005-06-25 14:57:23 -0700395# define prepare_arch_switch(next) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700396#endif
Nick Piggin4866cde2005-06-25 14:57:23 -0700397#ifndef finish_arch_switch
398# define finish_arch_switch(prev) do { } while (0)
399#endif
400
401#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar70b97a72006-07-03 00:25:42 -0700402static inline int task_running(struct rq *rq, struct task_struct *p)
Nick Piggin4866cde2005-06-25 14:57:23 -0700403{
404 return rq->curr == p;
405}
406
Ingo Molnar70b97a72006-07-03 00:25:42 -0700407static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -0700408{
409}
410
Ingo Molnar70b97a72006-07-03 00:25:42 -0700411static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
Nick Piggin4866cde2005-06-25 14:57:23 -0700412{
Ingo Molnarda04c032005-09-13 11:17:59 +0200413#ifdef CONFIG_DEBUG_SPINLOCK
414 /* this is a valid case when another task releases the spinlock */
415 rq->lock.owner = current;
416#endif
Ingo Molnar8a25d5d2006-07-03 00:24:54 -0700417 /*
418 * If we are tracking spinlock dependencies then we have to
419 * fix up the runqueue lock - which gets 'carried over' from
420 * prev into current:
421 */
422 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
423
Nick Piggin4866cde2005-06-25 14:57:23 -0700424 spin_unlock_irq(&rq->lock);
425}
426
427#else /* __ARCH_WANT_UNLOCKED_CTXSW */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700428static inline int task_running(struct rq *rq, struct task_struct *p)
Nick Piggin4866cde2005-06-25 14:57:23 -0700429{
430#ifdef CONFIG_SMP
431 return p->oncpu;
432#else
433 return rq->curr == p;
434#endif
435}
436
Ingo Molnar70b97a72006-07-03 00:25:42 -0700437static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -0700438{
439#ifdef CONFIG_SMP
440 /*
441 * We can optimise this out completely for !SMP, because the
442 * SMP rebalancing from interrupt is the only thing that cares
443 * here.
444 */
445 next->oncpu = 1;
446#endif
447#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
448 spin_unlock_irq(&rq->lock);
449#else
450 spin_unlock(&rq->lock);
451#endif
452}
453
Ingo Molnar70b97a72006-07-03 00:25:42 -0700454static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
Nick Piggin4866cde2005-06-25 14:57:23 -0700455{
456#ifdef CONFIG_SMP
457 /*
458 * After ->oncpu is cleared, the task can be moved to a different CPU.
459 * We must ensure this doesn't happen until the switch is completely
460 * finished.
461 */
462 smp_wmb();
463 prev->oncpu = 0;
464#endif
465#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
466 local_irq_enable();
467#endif
468}
469#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700470
471/*
Ingo Molnarb29739f2006-06-27 02:54:51 -0700472 * __task_rq_lock - lock the runqueue a given task resides on.
473 * Must be called interrupts disabled.
474 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700475static inline struct rq *__task_rq_lock(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700476 __acquires(rq->lock)
477{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700478 struct rq *rq;
Ingo Molnarb29739f2006-06-27 02:54:51 -0700479
480repeat_lock_task:
481 rq = task_rq(p);
482 spin_lock(&rq->lock);
483 if (unlikely(rq != task_rq(p))) {
484 spin_unlock(&rq->lock);
485 goto repeat_lock_task;
486 }
487 return rq;
488}
489
490/*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700491 * task_rq_lock - lock the runqueue a given task resides on and disable
492 * interrupts. Note the ordering: we can safely lookup the task_rq without
493 * explicitly disabling preemption.
494 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700495static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700496 __acquires(rq->lock)
497{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700498 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499
500repeat_lock_task:
501 local_irq_save(*flags);
502 rq = task_rq(p);
503 spin_lock(&rq->lock);
504 if (unlikely(rq != task_rq(p))) {
505 spin_unlock_irqrestore(&rq->lock, *flags);
506 goto repeat_lock_task;
507 }
508 return rq;
509}
510
Ingo Molnar70b97a72006-07-03 00:25:42 -0700511static inline void __task_rq_unlock(struct rq *rq)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700512 __releases(rq->lock)
513{
514 spin_unlock(&rq->lock);
515}
516
Ingo Molnar70b97a72006-07-03 00:25:42 -0700517static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700518 __releases(rq->lock)
519{
520 spin_unlock_irqrestore(&rq->lock, *flags);
521}
522
Linus Torvalds1da177e2005-04-16 15:20:36 -0700523/*
Robert P. J. Daycc2a73b2006-12-10 02:20:00 -0800524 * this_rq_lock - lock this runqueue and disable interrupts.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700525 */
Ingo Molnar70b97a72006-07-03 00:25:42 -0700526static inline struct rq *this_rq_lock(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700527 __acquires(rq->lock)
528{
Ingo Molnar70b97a72006-07-03 00:25:42 -0700529 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700530
531 local_irq_disable();
532 rq = this_rq();
533 spin_lock(&rq->lock);
534
535 return rq;
536}
537
Ingo Molnarc24d20d2007-07-09 18:51:59 +0200538/*
Ingo Molnar1b9f19c2007-07-09 18:51:59 +0200539 * CPU frequency is/was unstable - start new by setting prev_clock_raw:
540 */
541void sched_clock_unstable_event(void)
542{
543 unsigned long flags;
544 struct rq *rq;
545
546 rq = task_rq_lock(current, &flags);
547 rq->prev_clock_raw = sched_clock();
548 rq->clock_unstable_events++;
549 task_rq_unlock(rq, &flags);
550}
551
552/*
Ingo Molnarc24d20d2007-07-09 18:51:59 +0200553 * resched_task - mark a task 'to be rescheduled now'.
554 *
555 * On UP this means the setting of the need_resched flag, on SMP it
556 * might also involve a cross-CPU call to trigger the scheduler on
557 * the target CPU.
558 */
559#ifdef CONFIG_SMP
560
561#ifndef tsk_is_polling
562#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
563#endif
564
565static void resched_task(struct task_struct *p)
566{
567 int cpu;
568
569 assert_spin_locked(&task_rq(p)->lock);
570
571 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
572 return;
573
574 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
575
576 cpu = task_cpu(p);
577 if (cpu == smp_processor_id())
578 return;
579
580 /* NEED_RESCHED must be visible before we test polling */
581 smp_mb();
582 if (!tsk_is_polling(p))
583 smp_send_reschedule(cpu);
584}
585
586static void resched_cpu(int cpu)
587{
588 struct rq *rq = cpu_rq(cpu);
589 unsigned long flags;
590
591 if (!spin_trylock_irqsave(&rq->lock, flags))
592 return;
593 resched_task(cpu_curr(cpu));
594 spin_unlock_irqrestore(&rq->lock, flags);
595}
596#else
597static inline void resched_task(struct task_struct *p)
598{
599 assert_spin_locked(&task_rq(p)->lock);
600 set_tsk_need_resched(p);
601}
602#endif
603
static u64 div64_likely32(u64 divident, unsigned long divisor)
{
#if BITS_PER_LONG == 32
	if (likely(divident <= 0xffffffffULL))
		return (u32)divident / divisor;
	do_div(divident, divisor);

	return divident;
#else
	return divident / divisor;
#endif
}

#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

static inline unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	if (unlikely(!lw->inv_weight))
		lw->inv_weight = WMULT_CONST / lw->weight;

	tmp = (u64)delta_exec * weight;
	/*
	 * Check whether we'd overflow the 64-bit multiplication:
	 */
	if (unlikely(tmp > WMULT_CONST)) {
		tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight)
			>> (WMULT_SHIFT/2);
	} else {
		tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
	}

	return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
}
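/*
 * Worked example (illustrative, not from the original source): the
 * function approximates delta_exec * weight / lw->weight using the
 * cached reciprocal, i.e. (delta_exec * weight * inv_weight) >> 32.
 * For delta_exec = 1,000,000 ns, weight = NICE_0_LOAD (1024) and a
 * queue weight of 2048, inv_weight is 2^32/2048 == 2097152, so the
 * result is 1,000,000 * 1024 * 2097152 >> 32 == 500,000 ns - half the
 * wall-clock delta, as expected when the task owns half the queue weight.
 */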

static inline unsigned long
calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
{
	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
}

static void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;
}

static void update_load_sub(struct load_weight *lw, unsigned long dec)
{
	lw->weight -= dec;
	lw->inv_weight = 0;
}

static void __update_curr_load(struct rq *rq, struct load_stat *ls)
{
	if (rq->curr != rq->idle && ls->load.weight) {
		ls->delta_exec += ls->delta_stat;
		ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
		ls->delta_stat = 0;
	}
}

/*
 * Update delta_exec, delta_fair fields for rq.
 *
 * delta_fair clock advances at a rate inversely proportional to
 * total load (rq->ls.load.weight) on the runqueue, while
 * delta_exec advances at the same rate as wall-clock (provided
 * cpu is not idle).
 *
 * delta_exec / delta_fair is a measure of the (smoothened) load on this
 * runqueue over any given interval. This (smoothened) load is used
 * during load balance.
 *
 * This function is called /before/ updating rq->ls.load
 * and when switching tasks.
 */
static void update_curr_load(struct rq *rq, u64 now)
{
	struct load_stat *ls = &rq->ls;
	u64 start;

	start = ls->load_update_start;
	ls->load_update_start = now;
	ls->delta_stat += now - start;
	/*
	 * Stagger updates to ls->delta_fair. Very frequent updates
	 * can be expensive.
	 */
	if (ls->delta_stat >= sysctl_sched_stat_granularity)
		__update_curr_load(rq, ls);
}
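/*
 * Numeric illustration (not in the original source): if the runqueue
 * carries a total weight of 2 * NICE_0_LOAD, calc_delta_fair() advances
 * delta_fair at half wall-clock speed, so over a busy 10 ms window
 * delta_exec grows by 10,000,000 ns while delta_fair grows by about
 * 5,000,000 ns, giving delta_exec / delta_fair ~= 2 - roughly two
 * nice-0 tasks' worth of load for the balancer to look at.
 */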

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
 * each task makes to its run queue's load is weighted according to its
 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
 * scaled version of the new time slice allocation that they receive on time
 * slice expiry etc.
 */

/*
 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
 * If static_prio_timeslice() is ever changed to break this assumption then
 * this code will need modification
 */
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define load_weight(lp) \
	(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) \
	load_weight(static_prio_timeslice(prio))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
	(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))

#define WEIGHT_IDLEPRIO		2
#define WMULT_IDLEPRIO		(1 << 31)

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage.
 */
static const int prio_to_weight[40] = {
/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921,
/* -10 */  9537,  7629,  6103,  4883,  3906,  3125,  2500,  2000,  1600,  1280,
/*   0 */ NICE_0_LOAD /* 1024 */,
/*   1 */   819,   655,   524,   419,   336,   268,   215,   172,   137,
/*  10 */   110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};
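/*
 * Sanity check of the comment above (illustrative, not in the original
 * source): each step is roughly a factor of 1.25, e.g. nice 0 vs nice 5
 * is 1024 vs 336, so two CPU-bound tasks at those levels split the CPU
 * about 75% / 25% - five cumulative "10% of the remainder" steps.
 */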

static const u32 prio_to_wmult[40] = {
	48356,   60446,   75558,   94446,  118058,  147573,
	184467,  230589,  288233,  360285,  450347,
	562979,  703746,  879575, 1099582, 1374389,
	1717986, 2147483, 2684354, 3355443, 4194304,
	5244160, 6557201, 8196502, 10250518, 12782640,
	16025997, 19976592, 24970740, 31350126, 39045157,
	49367440, 61356675, 76695844, 95443717, 119304647,
	148102320, 186737708, 238609294, 286331153,
};
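/*
 * Illustrative note (not in the original source): each entry is the
 * precomputed inverse weight 2^32 / prio_to_weight[i], e.g. the nice-0
 * entry is 4194304 == 2^32 / 1024, so calc_delta_mine() can scale by a
 * weight with a multiply and a 32-bit shift instead of a 64-bit divide.
 */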
Peter Williams2dd73a42006-06-27 02:54:34 -0700759
Ingo Molnar36c8b582006-07-03 00:25:41 -0700760static inline void
Ingo Molnardd41f592007-07-09 18:51:59 +0200761inc_load(struct rq *rq, const struct task_struct *p, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700762{
Ingo Molnardd41f592007-07-09 18:51:59 +0200763 update_curr_load(rq, now);
764 update_load_add(&rq->ls.load, p->se.load.weight);
Peter Williams2dd73a42006-06-27 02:54:34 -0700765}
766
Ingo Molnar36c8b582006-07-03 00:25:41 -0700767static inline void
Ingo Molnardd41f592007-07-09 18:51:59 +0200768dec_load(struct rq *rq, const struct task_struct *p, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700769{
Ingo Molnardd41f592007-07-09 18:51:59 +0200770 update_curr_load(rq, now);
771 update_load_sub(&rq->ls.load, p->se.load.weight);
Peter Williams2dd73a42006-06-27 02:54:34 -0700772}
773
Ingo Molnardd41f592007-07-09 18:51:59 +0200774static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700775{
776 rq->nr_running++;
Ingo Molnardd41f592007-07-09 18:51:59 +0200777 inc_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -0700778}
779
Ingo Molnardd41f592007-07-09 18:51:59 +0200780static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
Peter Williams2dd73a42006-06-27 02:54:34 -0700781{
782 rq->nr_running--;
Ingo Molnardd41f592007-07-09 18:51:59 +0200783 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -0700784}
785
Ingo Molnardd41f592007-07-09 18:51:59 +0200786static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
787
788/*
789 * runqueue iterator, to support SMP load-balancing between different
790 * scheduling classes, without having to expose their internal data
791 * structures to the load-balancing proper:
792 */
793struct rq_iterator {
794 void *arg;
795 struct task_struct *(*start)(void *);
796 struct task_struct *(*next)(void *);
797};
798
799static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
800 unsigned long max_nr_move, unsigned long max_load_move,
801 struct sched_domain *sd, enum cpu_idle_type idle,
802 int *all_pinned, unsigned long *load_moved,
803 int this_best_prio, int best_prio, int best_prio_seen,
804 struct rq_iterator *iterator);
805
806#include "sched_stats.h"
807#include "sched_rt.c"
808#include "sched_fair.c"
809#include "sched_idletask.c"
810#ifdef CONFIG_SCHED_DEBUG
811# include "sched_debug.c"
812#endif
813
814#define sched_class_highest (&rt_sched_class)
815
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200816static void set_load_weight(struct task_struct *p)
817{
Ingo Molnardd41f592007-07-09 18:51:59 +0200818 task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
819 p->se.wait_runtime = 0;
820
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200821 if (task_has_rt_policy(p)) {
Ingo Molnardd41f592007-07-09 18:51:59 +0200822 p->se.load.weight = prio_to_weight[0] * 2;
823 p->se.load.inv_weight = prio_to_wmult[0] >> 1;
824 return;
825 }
826
827 /*
828 * SCHED_IDLE tasks get minimal weight:
829 */
830 if (p->policy == SCHED_IDLE) {
831 p->se.load.weight = WEIGHT_IDLEPRIO;
832 p->se.load.inv_weight = WMULT_IDLEPRIO;
833 return;
834 }
835
836 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
837 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
Ingo Molnar45bf76d2007-07-09 18:51:59 +0200838}
839
Ingo Molnardd41f592007-07-09 18:51:59 +0200840static void
841enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now)
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200842{
843 sched_info_queued(p);
Ingo Molnardd41f592007-07-09 18:51:59 +0200844 p->sched_class->enqueue_task(rq, p, wakeup, now);
845 p->se.on_rq = 1;
846}
847
848static void
849dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now)
850{
851 p->sched_class->dequeue_task(rq, p, sleep, now);
852 p->se.on_rq = 0;
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200853}
854
855/*
Ingo Molnardd41f592007-07-09 18:51:59 +0200856 * __normal_prio - return the priority that is based on the static prio
Ingo Molnar71f8bd42007-07-09 18:51:59 +0200857 */
Ingo Molnar14531182007-07-09 18:51:59 +0200858static inline int __normal_prio(struct task_struct *p)
859{
Ingo Molnardd41f592007-07-09 18:51:59 +0200860 return p->static_prio;
Ingo Molnar14531182007-07-09 18:51:59 +0200861}
862
863/*
Ingo Molnarb29739f2006-06-27 02:54:51 -0700864 * Calculate the expected normal priority: i.e. priority
865 * without taking RT-inheritance into account. Might be
866 * boosted by interactivity modifiers. Changes upon fork,
867 * setprio syscalls, and whenever the interactivity
868 * estimator recalculates.
869 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700870static inline int normal_prio(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700871{
872 int prio;
873
Ingo Molnare05606d2007-07-09 18:51:59 +0200874 if (task_has_rt_policy(p))
Ingo Molnarb29739f2006-06-27 02:54:51 -0700875 prio = MAX_RT_PRIO-1 - p->rt_priority;
876 else
877 prio = __normal_prio(p);
878 return prio;
879}
880
881/*
882 * Calculate the current priority, i.e. the priority
883 * taken into account by the scheduler. This value might
884 * be boosted by RT tasks, or might be boosted by
885 * interactivity modifiers. Will be RT if the task got
886 * RT-boosted. If not then it returns p->normal_prio.
887 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700888static int effective_prio(struct task_struct *p)
Ingo Molnarb29739f2006-06-27 02:54:51 -0700889{
890 p->normal_prio = normal_prio(p);
891 /*
892 * If we are RT tasks or we were boosted to RT priority,
893 * keep the priority unchanged. Otherwise, update priority
894 * to the normal priority:
895 */
896 if (!rt_prio(p->prio))
897 return p->normal_prio;
898 return p->prio;
899}
900
901/*
Ingo Molnardd41f592007-07-09 18:51:59 +0200902 * activate_task - move a task to the runqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700903 */
Ingo Molnardd41f592007-07-09 18:51:59 +0200904static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700905{
Ingo Molnardd41f592007-07-09 18:51:59 +0200906 u64 now = rq_clock(rq);
Con Kolivasd425b272006-03-31 02:31:29 -0800907
Ingo Molnardd41f592007-07-09 18:51:59 +0200908 if (p->state == TASK_UNINTERRUPTIBLE)
909 rq->nr_uninterruptible--;
910
911 enqueue_task(rq, p, wakeup, now);
912 inc_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700913}
914
915/*
Ingo Molnardd41f592007-07-09 18:51:59 +0200916 * activate_idle_task - move idle task to the _front_ of runqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700917 */
Ingo Molnardd41f592007-07-09 18:51:59 +0200918static inline void activate_idle_task(struct task_struct *p, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700919{
Ingo Molnardd41f592007-07-09 18:51:59 +0200920 u64 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700921
Ingo Molnardd41f592007-07-09 18:51:59 +0200922 if (p->state == TASK_UNINTERRUPTIBLE)
923 rq->nr_uninterruptible--;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700924
Ingo Molnardd41f592007-07-09 18:51:59 +0200925 enqueue_task(rq, p, 0, now);
926 inc_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700927}
928
929/*
930 * deactivate_task - remove a task from the runqueue.
931 */
Ingo Molnardd41f592007-07-09 18:51:59 +0200932static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933{
Ingo Molnardd41f592007-07-09 18:51:59 +0200934 u64 now = rq_clock(rq);
935
936 if (p->state == TASK_UNINTERRUPTIBLE)
937 rq->nr_uninterruptible++;
938
939 dequeue_task(rq, p, sleep, now);
940 dec_nr_running(p, rq, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700941}
942
Linus Torvalds1da177e2005-04-16 15:20:36 -0700943/**
944 * task_curr - is this task currently executing on a CPU?
945 * @p: the task in question.
946 */
Ingo Molnar36c8b582006-07-03 00:25:41 -0700947inline int task_curr(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700948{
949 return cpu_curr(task_cpu(p)) == p;
950}
951
Peter Williams2dd73a42006-06-27 02:54:34 -0700952/* Used instead of source_load when we know the type == 0 */
953unsigned long weighted_cpuload(const int cpu)
954{
Ingo Molnardd41f592007-07-09 18:51:59 +0200955 return cpu_rq(cpu)->ls.load.weight;
956}
957
958static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
959{
960#ifdef CONFIG_SMP
961 task_thread_info(p)->cpu = cpu;
962 set_task_cfs_rq(p);
963#endif
Peter Williams2dd73a42006-06-27 02:54:34 -0700964}
965
Linus Torvalds1da177e2005-04-16 15:20:36 -0700966#ifdef CONFIG_SMP
Ingo Molnarc65cc872007-07-09 18:51:58 +0200967
Ingo Molnardd41f592007-07-09 18:51:59 +0200968void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
Ingo Molnarc65cc872007-07-09 18:51:58 +0200969{
Ingo Molnardd41f592007-07-09 18:51:59 +0200970 int old_cpu = task_cpu(p);
971 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu);
972 u64 clock_offset, fair_clock_offset;
973
974 clock_offset = old_rq->clock - new_rq->clock;
975 fair_clock_offset = old_rq->cfs.fair_clock -
976 new_rq->cfs.fair_clock;
977 if (p->se.wait_start)
978 p->se.wait_start -= clock_offset;
979 if (p->se.wait_start_fair)
980 p->se.wait_start_fair -= fair_clock_offset;
981 if (p->se.sleep_start)
982 p->se.sleep_start -= clock_offset;
983 if (p->se.block_start)
984 p->se.block_start -= clock_offset;
985 if (p->se.sleep_start_fair)
986 p->se.sleep_start_fair -= fair_clock_offset;
987
988 __set_task_cpu(p, new_cpu);
Ingo Molnarc65cc872007-07-09 18:51:58 +0200989}
990
Ingo Molnar70b97a72006-07-03 00:25:42 -0700991struct migration_req {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700992 struct list_head list;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700993
Ingo Molnar36c8b582006-07-03 00:25:41 -0700994 struct task_struct *task;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700995 int dest_cpu;
996
Linus Torvalds1da177e2005-04-16 15:20:36 -0700997 struct completion done;
Ingo Molnar70b97a72006-07-03 00:25:42 -0700998};
Linus Torvalds1da177e2005-04-16 15:20:36 -0700999
1000/*
1001 * The task's runqueue lock must be held.
1002 * Returns true if you have to wait for migration thread.
1003 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001004static int
Ingo Molnar70b97a72006-07-03 00:25:42 -07001005migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001006{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001007 struct rq *rq = task_rq(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001008
1009 /*
1010 * If the task is not on a runqueue (and not running), then
1011 * it is sufficient to simply update the task's cpu field.
1012 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001013 if (!p->se.on_rq && !task_running(rq, p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001014 set_task_cpu(p, dest_cpu);
1015 return 0;
1016 }
1017
1018 init_completion(&req->done);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001019 req->task = p;
1020 req->dest_cpu = dest_cpu;
1021 list_add(&req->list, &rq->migration_queue);
Ingo Molnar48f24c42006-07-03 00:25:40 -07001022
Linus Torvalds1da177e2005-04-16 15:20:36 -07001023 return 1;
1024}
1025
1026/*
1027 * wait_task_inactive - wait for a thread to unschedule.
1028 *
1029 * The caller must ensure that the task *will* unschedule sometime soon,
1030 * else this function might spin for a *long* time. This function can't
1031 * be called with interrupts off, or it may introduce deadlock with
1032 * smp_call_function() if an IPI is sent by the same process we are
1033 * waiting to become inactive.
1034 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001035void wait_task_inactive(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001036{
1037 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001038 int running, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001039 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001040
1041repeat:
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001042 /*
1043 * We do the initial early heuristics without holding
1044 * any task-queue locks at all. We'll only try to get
1045 * the runqueue lock when things look like they will
1046 * work out!
1047 */
1048 rq = task_rq(p);
1049
1050 /*
1051 * If the task is actively running on another CPU
1052 * still, just relax and busy-wait without holding
1053 * any locks.
1054 *
1055 * NOTE! Since we don't hold any locks, it's not
1056 * even sure that "rq" stays as the right runqueue!
1057 * But we don't care, since "task_running()" will
1058 * return false if the runqueue has changed and p
1059 * is actually now running somewhere else!
1060 */
1061 while (task_running(rq, p))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001062 cpu_relax();
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001063
1064 /*
1065 * Ok, time to look more closely! We need the rq
1066 * lock now, to be *sure*. If we're wrong, we'll
1067 * just go back and repeat.
1068 */
1069 rq = task_rq_lock(p, &flags);
1070 running = task_running(rq, p);
Ingo Molnardd41f592007-07-09 18:51:59 +02001071 on_rq = p->se.on_rq;
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001072 task_rq_unlock(rq, &flags);
1073
1074 /*
1075 * Was it really running after all now that we
1076 * checked with the proper locks actually held?
1077 *
1078 * Oops. Go back and try again..
1079 */
1080 if (unlikely(running)) {
1081 cpu_relax();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001082 goto repeat;
1083 }
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001084
1085 /*
1086 * It's not enough that it's not actively running,
1087 * it must be off the runqueue _entirely_, and not
1088 * preempted!
1089 *
1090 * So if it wa still runnable (but just not actively
1091 * running right now), it's preempted, and we should
1092 * yield - it could be a while.
1093 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001094 if (unlikely(on_rq)) {
Linus Torvaldsfa490cf2007-06-18 09:34:40 -07001095 yield();
1096 goto repeat;
1097 }
1098
1099 /*
1100 * Ahh, all good. It wasn't running, and it wasn't
1101 * runnable, which means that it will never become
1102 * running in the future either. We're all done!
1103 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001104}
1105
/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
 * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
 * achieved as well.
 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001119void kick_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001120{
1121 int cpu;
1122
1123 preempt_disable();
1124 cpu = task_cpu(p);
1125 if ((cpu != smp_processor_id()) && task_curr(p))
1126 smp_send_reschedule(cpu);
1127 preempt_enable();
1128}
1129
1130/*
Peter Williams2dd73a42006-06-27 02:54:34 -07001131 * Return a low guess at the load of a migration-source cpu weighted
1132 * according to the scheduling class and "nice" value.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001133 *
1134 * We want to under-estimate the load of migration sources, to
1135 * balance conservatively.
1136 */
Con Kolivasb9104722005-11-08 21:38:55 -08001137static inline unsigned long source_load(int cpu, int type)
1138{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001139 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001140 unsigned long total = weighted_cpuload(cpu);
Nick Piggina2000572006-02-10 01:51:02 -08001141
Peter Williams2dd73a42006-06-27 02:54:34 -07001142 if (type == 0)
Ingo Molnardd41f592007-07-09 18:51:59 +02001143 return total;
Peter Williams2dd73a42006-06-27 02:54:34 -07001144
Ingo Molnardd41f592007-07-09 18:51:59 +02001145 return min(rq->cpu_load[type-1], total);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001146}
1147
1148/*
Peter Williams2dd73a42006-06-27 02:54:34 -07001149 * Return a high guess at the load of a migration-target cpu weighted
1150 * according to the scheduling class and "nice" value.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001151 */
Con Kolivasb9104722005-11-08 21:38:55 -08001152static inline unsigned long target_load(int cpu, int type)
1153{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001154 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001155 unsigned long total = weighted_cpuload(cpu);
Nick Piggina2000572006-02-10 01:51:02 -08001156
Peter Williams2dd73a42006-06-27 02:54:34 -07001157 if (type == 0)
Ingo Molnardd41f592007-07-09 18:51:59 +02001158 return total;
Peter Williams2dd73a42006-06-27 02:54:34 -07001159
Ingo Molnardd41f592007-07-09 18:51:59 +02001160 return max(rq->cpu_load[type-1], total);
Peter Williams2dd73a42006-06-27 02:54:34 -07001161}
1162
1163/*
1164 * Return the average load per task on the cpu's run queue
1165 */
1166static inline unsigned long cpu_avg_load_per_task(int cpu)
1167{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001168 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02001169 unsigned long total = weighted_cpuload(cpu);
Peter Williams2dd73a42006-06-27 02:54:34 -07001170 unsigned long n = rq->nr_running;
1171
Ingo Molnardd41f592007-07-09 18:51:59 +02001172 return n ? total / n : SCHED_LOAD_SCALE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001173}
1174
Nick Piggin147cbb42005-06-25 14:57:19 -07001175/*
1176 * find_idlest_group finds and returns the least busy CPU group within the
1177 * domain.
1178 */
1179static struct sched_group *
1180find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1181{
1182 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1183 unsigned long min_load = ULONG_MAX, this_load = 0;
1184 int load_idx = sd->forkexec_idx;
1185 int imbalance = 100 + (sd->imbalance_pct-100)/2;
1186
1187 do {
1188 unsigned long load, avg_load;
1189 int local_group;
1190 int i;
1191
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001192 /* Skip over this group if it has no CPUs allowed */
1193 if (!cpus_intersects(group->cpumask, p->cpus_allowed))
1194 goto nextgroup;
1195
Nick Piggin147cbb42005-06-25 14:57:19 -07001196 local_group = cpu_isset(this_cpu, group->cpumask);
Nick Piggin147cbb42005-06-25 14:57:19 -07001197
1198 /* Tally up the load of all CPUs in the group */
1199 avg_load = 0;
1200
1201 for_each_cpu_mask(i, group->cpumask) {
1202 /* Bias balancing toward cpus of our domain */
1203 if (local_group)
1204 load = source_load(i, load_idx);
1205 else
1206 load = target_load(i, load_idx);
1207
1208 avg_load += load;
1209 }
1210
1211 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07001212 avg_load = sg_div_cpu_power(group,
1213 avg_load * SCHED_LOAD_SCALE);
Nick Piggin147cbb42005-06-25 14:57:19 -07001214
1215 if (local_group) {
1216 this_load = avg_load;
1217 this = group;
1218 } else if (avg_load < min_load) {
1219 min_load = avg_load;
1220 idlest = group;
1221 }
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001222nextgroup:
Nick Piggin147cbb42005-06-25 14:57:19 -07001223 group = group->next;
1224 } while (group != sd->groups);
1225
1226 if (!idlest || 100*this_load < imbalance*min_load)
1227 return NULL;
1228 return idlest;
1229}
1230
1231/*
Satoru Takeuchi0feaece2006-10-03 01:14:10 -07001232 * find_idlest_cpu - find the idlest cpu among the cpus in group.
Nick Piggin147cbb42005-06-25 14:57:19 -07001233 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07001234static int
1235find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
Nick Piggin147cbb42005-06-25 14:57:19 -07001236{
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001237 cpumask_t tmp;
Nick Piggin147cbb42005-06-25 14:57:19 -07001238 unsigned long load, min_load = ULONG_MAX;
1239 int idlest = -1;
1240 int i;
1241
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001242 /* Traverse only the allowed CPUs */
1243 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1244
1245 for_each_cpu_mask(i, tmp) {
Peter Williams2dd73a42006-06-27 02:54:34 -07001246 load = weighted_cpuload(i);
Nick Piggin147cbb42005-06-25 14:57:19 -07001247
1248 if (load < min_load || (load == min_load && i == this_cpu)) {
1249 min_load = load;
1250 idlest = i;
1251 }
1252 }
1253
1254 return idlest;
1255}
1256
Nick Piggin476d1392005-06-25 14:57:29 -07001257/*
1258 * sched_balance_self: balance the current task (running on cpu) in domains
1259 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1260 * SD_BALANCE_EXEC.
1261 *
1262 * Balance, ie. select the least loaded group.
1263 *
1264 * Returns the target CPU number, or the same CPU if no balancing is needed.
1265 *
1266 * preempt must be disabled.
1267 */
1268static int sched_balance_self(int cpu, int flag)
1269{
1270 struct task_struct *t = current;
1271 struct sched_domain *tmp, *sd = NULL;
Nick Piggin147cbb42005-06-25 14:57:19 -07001272
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001273 for_each_domain(cpu, tmp) {
Ingo Molnar9761eea2007-07-09 18:52:00 +02001274 /*
1275 * If power savings logic is enabled for a domain, stop there.
1276 */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07001277 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1278 break;
Nick Piggin476d1392005-06-25 14:57:29 -07001279 if (tmp->flags & flag)
1280 sd = tmp;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001281 }
Nick Piggin476d1392005-06-25 14:57:29 -07001282
1283 while (sd) {
1284 cpumask_t span;
1285 struct sched_group *group;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001286 int new_cpu, weight;
1287
1288 if (!(sd->flags & flag)) {
1289 sd = sd->child;
1290 continue;
1291 }
Nick Piggin476d1392005-06-25 14:57:29 -07001292
1293 span = sd->span;
1294 group = find_idlest_group(sd, t, cpu);
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001295 if (!group) {
1296 sd = sd->child;
1297 continue;
1298 }
Nick Piggin476d1392005-06-25 14:57:29 -07001299
M.Baris Demirayda5a5522005-09-10 00:26:09 -07001300 new_cpu = find_idlest_cpu(group, t, cpu);
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001301 if (new_cpu == -1 || new_cpu == cpu) {
1302 /* Now try balancing at a lower domain level of cpu */
1303 sd = sd->child;
1304 continue;
1305 }
Nick Piggin476d1392005-06-25 14:57:29 -07001306
Siddha, Suresh B1a848872006-10-03 01:14:08 -07001307 /* Now try balancing at a lower domain level of new_cpu */
Nick Piggin476d1392005-06-25 14:57:29 -07001308 cpu = new_cpu;
Nick Piggin476d1392005-06-25 14:57:29 -07001309 sd = NULL;
1310 weight = cpus_weight(span);
1311 for_each_domain(cpu, tmp) {
1312 if (weight <= cpus_weight(tmp->span))
1313 break;
1314 if (tmp->flags & flag)
1315 sd = tmp;
1316 }
1317 /* while loop will break here if sd == NULL */
1318 }
1319
1320 return cpu;
1321}
1322
1323#endif /* CONFIG_SMP */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001324
1325/*
1326 * wake_idle() will wake a task on an idle cpu if task->cpu is
1327 * not idle and an idle cpu is available. The span of cpus to
1328 * search starts with cpus closest then further out as needed,
1329 * so we always favor a closer, idle cpu.
1330 *
1331 * Returns the CPU we should wake onto.
1332 */
1333#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
Ingo Molnar36c8b582006-07-03 00:25:41 -07001334static int wake_idle(int cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001335{
1336 cpumask_t tmp;
1337 struct sched_domain *sd;
1338 int i;
1339
	/*
	 * If it is idle, then it is the best cpu to run this task.
	 *
	 * This cpu is also the best, if it has more than one task already.
	 * Siblings must also be busy (in most cases) as they didn't already
	 * pick up the extra load from this cpu and hence we need not check
	 * sibling runqueue info. This avoids the checks and the cache miss
	 * penalties associated with that.
	 */
1349 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001350 return cpu;
1351
1352 for_each_domain(cpu, sd) {
1353 if (sd->flags & SD_WAKE_IDLE) {
Nick Piggine0f364f2005-06-25 14:57:06 -07001354 cpus_and(tmp, sd->span, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001355 for_each_cpu_mask(i, tmp) {
1356 if (idle_cpu(i))
1357 return i;
1358 }
Ingo Molnar9761eea2007-07-09 18:52:00 +02001359 } else {
Nick Piggine0f364f2005-06-25 14:57:06 -07001360 break;
Ingo Molnar9761eea2007-07-09 18:52:00 +02001361 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001362 }
1363 return cpu;
1364}
1365#else
Ingo Molnar36c8b582006-07-03 00:25:41 -07001366static inline int wake_idle(int cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001367{
1368 return cpu;
1369}
1370#endif
1371
1372/***
1373 * try_to_wake_up - wake up a thread
1374 * @p: the to-be-woken-up thread
1375 * @state: the mask of task states that can be woken
1376 * @sync: do a synchronous wakeup?
1377 *
1378 * Put it on the run-queue if it's not already there. The "current"
1379 * thread is always on the run-queue (except when the actual
1380 * re-schedule is in progress), and as such you're allowed to do
1381 * the simpler "current->state = TASK_RUNNING" to mark yourself
1382 * runnable without the overhead of this.
1383 *
1384 * returns failure only if the task is already active.
1385 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001386static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001387{
1388 int cpu, this_cpu, success = 0;
1389 unsigned long flags;
1390 long old_state;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001391 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001392#ifdef CONFIG_SMP
Nick Piggin78979862005-06-25 14:57:13 -07001393 struct sched_domain *sd, *this_sd = NULL;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001394 unsigned long load, this_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001395 int new_cpu;
1396#endif
1397
1398 rq = task_rq_lock(p, &flags);
1399 old_state = p->state;
1400 if (!(old_state & state))
1401 goto out;
1402
Ingo Molnardd41f592007-07-09 18:51:59 +02001403 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001404 goto out_running;
1405
1406 cpu = task_cpu(p);
1407 this_cpu = smp_processor_id();
1408
1409#ifdef CONFIG_SMP
1410 if (unlikely(task_running(rq, p)))
1411 goto out_activate;
1412
Nick Piggin78979862005-06-25 14:57:13 -07001413 new_cpu = cpu;
1414
Linus Torvalds1da177e2005-04-16 15:20:36 -07001415 schedstat_inc(rq, ttwu_cnt);
1416 if (cpu == this_cpu) {
1417 schedstat_inc(rq, ttwu_local);
Nick Piggin78979862005-06-25 14:57:13 -07001418 goto out_set_cpu;
1419 }
1420
1421 for_each_domain(this_cpu, sd) {
1422 if (cpu_isset(cpu, sd->span)) {
1423 schedstat_inc(sd, ttwu_wake_remote);
1424 this_sd = sd;
1425 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001426 }
1427 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428
Nick Piggin78979862005-06-25 14:57:13 -07001429 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001430 goto out_set_cpu;
1431
Linus Torvalds1da177e2005-04-16 15:20:36 -07001432 /*
Nick Piggin78979862005-06-25 14:57:13 -07001433 * Check for affine wakeup and passive balancing possibilities.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001434 */
Nick Piggin78979862005-06-25 14:57:13 -07001435 if (this_sd) {
1436 int idx = this_sd->wake_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001437 unsigned int imbalance;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001438
Nick Piggina3f21bc2005-06-25 14:57:15 -07001439 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1440
Nick Piggin78979862005-06-25 14:57:13 -07001441 load = source_load(cpu, idx);
1442 this_load = target_load(this_cpu, idx);
1443
Nick Piggin78979862005-06-25 14:57:13 -07001444 new_cpu = this_cpu; /* Wake to this CPU if we can */
1445
Nick Piggina3f21bc2005-06-25 14:57:15 -07001446 if (this_sd->flags & SD_WAKE_AFFINE) {
1447 unsigned long tl = this_load;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08001448 unsigned long tl_per_task;
1449
1450 tl_per_task = cpu_avg_load_per_task(this_cpu);
Peter Williams2dd73a42006-06-27 02:54:34 -07001451
Linus Torvalds1da177e2005-04-16 15:20:36 -07001452 /*
Nick Piggina3f21bc2005-06-25 14:57:15 -07001453 * If sync wakeup then subtract the (maximum possible)
1454 * effect of the currently running task from the load
1455 * of the current CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001456 */
Nick Piggina3f21bc2005-06-25 14:57:15 -07001457 if (sync)
Ingo Molnardd41f592007-07-09 18:51:59 +02001458 tl -= current->se.load.weight;
Nick Piggina3f21bc2005-06-25 14:57:15 -07001459
1460 if ((tl <= load &&
Peter Williams2dd73a42006-06-27 02:54:34 -07001461 tl + target_load(cpu, idx) <= tl_per_task) ||
Ingo Molnardd41f592007-07-09 18:51:59 +02001462 100*(tl + p->se.load.weight) <= imbalance*load) {
Nick Piggina3f21bc2005-06-25 14:57:15 -07001463 /*
1464 * This domain has SD_WAKE_AFFINE and
1465 * p is cache cold in this domain, and
1466 * there is no bad imbalance.
1467 */
1468 schedstat_inc(this_sd, ttwu_move_affine);
1469 goto out_set_cpu;
1470 }
1471 }
1472
1473 /*
1474 * Start passive balancing when half the imbalance_pct
1475 * limit is reached.
1476 */
1477 if (this_sd->flags & SD_WAKE_BALANCE) {
1478 if (imbalance*this_load <= 100*load) {
1479 schedstat_inc(this_sd, ttwu_move_balance);
1480 goto out_set_cpu;
1481 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001482 }
1483 }
1484
1485 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1486out_set_cpu:
1487 new_cpu = wake_idle(new_cpu, p);
1488 if (new_cpu != cpu) {
1489 set_task_cpu(p, new_cpu);
1490 task_rq_unlock(rq, &flags);
1491 /* might preempt at this point */
1492 rq = task_rq_lock(p, &flags);
1493 old_state = p->state;
1494 if (!(old_state & state))
1495 goto out;
Ingo Molnardd41f592007-07-09 18:51:59 +02001496 if (p->se.on_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001497 goto out_running;
1498
1499 this_cpu = smp_processor_id();
1500 cpu = task_cpu(p);
1501 }
1502
1503out_activate:
1504#endif /* CONFIG_SMP */
Ingo Molnardd41f592007-07-09 18:51:59 +02001505 activate_task(rq, p, 1);
Ingo Molnard79fc0f2005-09-10 00:26:12 -07001506 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001507 * Sync wakeups (i.e. those types of wakeups where the waker
1508 * has indicated that it will leave the CPU in short order)
1509 * don't trigger a preemption if the woken-up task will run on
1510 * this CPU. (In this case the 'I will reschedule' promise of
1511 * the waker guarantees that the freshly woken-up task is going
1512 * to be considered on this CPU.)
1513 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001514 if (!sync || cpu != this_cpu)
1515 check_preempt_curr(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001516 success = 1;
1517
1518out_running:
1519 p->state = TASK_RUNNING;
1520out:
1521 task_rq_unlock(rq, &flags);
1522
1523 return success;
1524}
1525
Ingo Molnar36c8b582006-07-03 00:25:41 -07001526int fastcall wake_up_process(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527{
1528 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1529 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1530}
Linus Torvalds1da177e2005-04-16 15:20:36 -07001531EXPORT_SYMBOL(wake_up_process);
1532
Ingo Molnar36c8b582006-07-03 00:25:41 -07001533int fastcall wake_up_state(struct task_struct *p, unsigned int state)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001534{
1535 return try_to_wake_up(p, state, 0);
1536}
1537
Linus Torvalds1da177e2005-04-16 15:20:36 -07001538/*
1539 * Perform scheduler related setup for a newly forked process p.
1540 * p is forked by current.
Ingo Molnardd41f592007-07-09 18:51:59 +02001541 *
1542 * __sched_fork() is basic setup used by init_idle() too:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001543 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001544static void __sched_fork(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001545{
Ingo Molnardd41f592007-07-09 18:51:59 +02001546 p->se.wait_start_fair = 0;
1547 p->se.wait_start = 0;
1548 p->se.exec_start = 0;
1549 p->se.sum_exec_runtime = 0;
1550 p->se.delta_exec = 0;
1551 p->se.delta_fair_run = 0;
1552 p->se.delta_fair_sleep = 0;
1553 p->se.wait_runtime = 0;
1554 p->se.sum_wait_runtime = 0;
1555 p->se.sum_sleep_runtime = 0;
1556 p->se.sleep_start = 0;
1557 p->se.sleep_start_fair = 0;
1558 p->se.block_start = 0;
1559 p->se.sleep_max = 0;
1560 p->se.block_max = 0;
1561 p->se.exec_max = 0;
1562 p->se.wait_max = 0;
1563 p->se.wait_runtime_overruns = 0;
1564 p->se.wait_runtime_underruns = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001565
Ingo Molnardd41f592007-07-09 18:51:59 +02001566 INIT_LIST_HEAD(&p->run_list);
1567 p->se.on_rq = 0;
Nick Piggin476d1392005-06-25 14:57:29 -07001568
Linus Torvalds1da177e2005-04-16 15:20:36 -07001569 /*
1570 * We mark the process as running here, but have not actually
1571 * inserted it onto the runqueue yet. This guarantees that
1572 * nobody will actually run it, and a signal or other external
1573 * event cannot wake it up and insert it on the runqueue either.
1574 */
1575 p->state = TASK_RUNNING;
Ingo Molnardd41f592007-07-09 18:51:59 +02001576}
1577
1578/*
1579 * fork()/clone()-time setup:
1580 */
1581void sched_fork(struct task_struct *p, int clone_flags)
1582{
1583 int cpu = get_cpu();
1584
1585 __sched_fork(p);
1586
1587#ifdef CONFIG_SMP
1588 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
1589#endif
1590 __set_task_cpu(p, cpu);
Ingo Molnarb29739f2006-06-27 02:54:51 -07001591
1592 /*
1593 * Make sure we do not leak PI boosting priority to the child:
1594 */
1595 p->prio = current->normal_prio;
1596
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001597#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
Ingo Molnardd41f592007-07-09 18:51:59 +02001598 if (likely(sched_info_on()))
Chandra Seetharaman52f17b62006-07-14 00:24:38 -07001599 memset(&p->sched_info, 0, sizeof(p->sched_info));
Linus Torvalds1da177e2005-04-16 15:20:36 -07001600#endif
Chen, Kenneth Wd6077cb2006-02-14 13:53:10 -08001601#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
Nick Piggin4866cde2005-06-25 14:57:23 -07001602 p->oncpu = 0;
1603#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001604#ifdef CONFIG_PREEMPT
Nick Piggin4866cde2005-06-25 14:57:23 -07001605 /* Want to start with kernel preemption disabled. */
Al Viroa1261f542005-11-13 16:06:55 -08001606 task_thread_info(p)->preempt_count = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001607#endif
Nick Piggin476d1392005-06-25 14:57:29 -07001608 put_cpu();
Linus Torvalds1da177e2005-04-16 15:20:36 -07001609}
1610
1611/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001612 * After fork, the child runs first (this is the default). If set to 0,
1613 * the parent will (try to) run first.
1614 */
1615unsigned int __read_mostly sysctl_sched_child_runs_first = 1;
1616
1617/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001618 * wake_up_new_task - wake up a newly created task for the first time.
1619 *
1620 * This function will do some initial scheduler statistics housekeeping
1621 * that must be done for every newly created context, then puts the task
1622 * on the runqueue and wakes it.
1623 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001624void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001625{
1626 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02001627 struct rq *rq;
1628 int this_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001629
1630 rq = task_rq_lock(p, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001631 BUG_ON(p->state != TASK_RUNNING);
Ingo Molnardd41f592007-07-09 18:51:59 +02001632 this_cpu = smp_processor_id(); /* parent's CPU */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001633
1634 p->prio = effective_prio(p);
1635
Ingo Molnardd41f592007-07-09 18:51:59 +02001636 if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
1637 task_cpu(p) != this_cpu || !current->se.on_rq) {
1638 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639 } else {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001640 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02001641 * Let the scheduling class do new task startup
1642 * management (if any):
Linus Torvalds1da177e2005-04-16 15:20:36 -07001643 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001644 p->sched_class->task_new(rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001645 }
Ingo Molnardd41f592007-07-09 18:51:59 +02001646 check_preempt_curr(rq, p);
1647 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001648}
1649
Linus Torvalds1da177e2005-04-16 15:20:36 -07001650/**
Nick Piggin4866cde2005-06-25 14:57:23 -07001651 * prepare_task_switch - prepare to switch tasks
1652 * @rq: the runqueue preparing to switch
1653 * @next: the task we are going to switch to.
1654 *
1655 * This is called with the rq lock held and interrupts off. It must
1656 * be paired with a subsequent finish_task_switch after the context
1657 * switch.
1658 *
1659 * prepare_task_switch sets up locking and calls architecture specific
1660 * hooks.
1661 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001662static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
Nick Piggin4866cde2005-06-25 14:57:23 -07001663{
1664 prepare_lock_switch(rq, next);
1665 prepare_arch_switch(next);
1666}
1667
1668/**
Linus Torvalds1da177e2005-04-16 15:20:36 -07001669 * finish_task_switch - clean up after a task-switch
Jeff Garzik344baba2005-09-07 01:15:17 -04001670 * @rq: runqueue associated with task-switch
Linus Torvalds1da177e2005-04-16 15:20:36 -07001671 * @prev: the thread we just switched away from.
1672 *
Nick Piggin4866cde2005-06-25 14:57:23 -07001673 * finish_task_switch must be called after the context switch, paired
1674 * with a prepare_task_switch call before the context switch.
1675 * finish_task_switch will reconcile locking set up by prepare_task_switch,
1676 * and do any other architecture-specific cleanup actions.
Linus Torvalds1da177e2005-04-16 15:20:36 -07001677 *
1678 * Note that we may have delayed dropping an mm in context_switch(). If
1679 * so, we finish that here outside of the runqueue lock. (Doing it
1680 * with the lock held can cause deadlocks; see schedule() for
1681 * details.)
1682 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001683static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001684 __releases(rq->lock)
1685{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001686 struct mm_struct *mm = rq->prev_mm;
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001687 long prev_state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001688
1689 rq->prev_mm = NULL;
1690
1691 /*
1692 * A task struct has one reference for the use as "current".
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001693 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001694 * schedule one last time. The schedule call will never return, and
1695 * the scheduled task must drop that reference.
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001696 * The test for TASK_DEAD must occur while the runqueue locks are
Linus Torvalds1da177e2005-04-16 15:20:36 -07001697 * still held, otherwise prev could be scheduled on another cpu, die
1698 * there before we look at prev->state, and then the reference would
1699 * be dropped twice.
1700 * Manfred Spraul <manfred@colorfullife.com>
1701 */
Oleg Nesterov55a101f2006-09-29 02:01:10 -07001702 prev_state = prev->state;
Nick Piggin4866cde2005-06-25 14:57:23 -07001703 finish_arch_switch(prev);
1704 finish_lock_switch(rq, prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001705 if (mm)
1706 mmdrop(mm);
Oleg Nesterovc394cc92006-09-29 02:01:11 -07001707 if (unlikely(prev_state == TASK_DEAD)) {
bibo maoc6fd91f2006-03-26 01:38:20 -08001708 /*
1709 * Remove function-return probe instances associated with this
1710 * task and put them back on the free list.
Ingo Molnar9761eea2007-07-09 18:52:00 +02001711 */
bibo maoc6fd91f2006-03-26 01:38:20 -08001712 kprobe_flush_task(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001713 put_task_struct(prev);
bibo maoc6fd91f2006-03-26 01:38:20 -08001714 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001715}
1716
1717/**
1718 * schedule_tail - first thing a freshly forked thread must call.
1719 * @prev: the thread we just switched away from.
1720 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001721asmlinkage void schedule_tail(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001722 __releases(rq->lock)
1723{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001724 struct rq *rq = this_rq();
1725
Nick Piggin4866cde2005-06-25 14:57:23 -07001726 finish_task_switch(rq, prev);
1727#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1728 /* In this case, finish_task_switch does not reenable preemption */
1729 preempt_enable();
1730#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001731 if (current->set_child_tid)
1732 put_user(current->pid, current->set_child_tid);
1733}
1734
1735/*
1736 * context_switch - switch to the new MM and the new
1737 * thread's register state.
1738 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001739static inline void
Ingo Molnar70b97a72006-07-03 00:25:42 -07001740context_switch(struct rq *rq, struct task_struct *prev,
Ingo Molnar36c8b582006-07-03 00:25:41 -07001741 struct task_struct *next)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001742{
Ingo Molnardd41f592007-07-09 18:51:59 +02001743 struct mm_struct *mm, *oldmm;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001744
Ingo Molnardd41f592007-07-09 18:51:59 +02001745 prepare_task_switch(rq, next);
1746 mm = next->mm;
1747 oldmm = prev->active_mm;
Zachary Amsden9226d122007-02-13 13:26:21 +01001748 /*
1749 * For paravirt, this is coupled with an exit in switch_to to
1750 * combine the page table reload and the switch backend into
1751 * one hypercall.
1752 */
1753 arch_enter_lazy_cpu_mode();
1754
Ingo Molnardd41f592007-07-09 18:51:59 +02001755 if (unlikely(!mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001756 next->active_mm = oldmm;
1757 atomic_inc(&oldmm->mm_count);
1758 enter_lazy_tlb(oldmm, next);
1759 } else
1760 switch_mm(oldmm, mm, next);
1761
Ingo Molnardd41f592007-07-09 18:51:59 +02001762 if (unlikely(!prev->mm)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001763 prev->active_mm = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001764 rq->prev_mm = oldmm;
1765 }
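	/*
	 * A kernel thread has no mm of its own (next->mm == NULL), so it
	 * borrows the previous task's active_mm above (lazy TLB). If the
	 * task we are switching away from was such a borrower, the borrowed
	 * mm is remembered in rq->prev_mm; finish_task_switch() will
	 * mmdrop() it once the runqueue lock has been released.
	 */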
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001766 /*
1767 * The runqueue lock will be released by the next
1768 * task (which is an invalid locking op but in the case
1769 * of the scheduler it's an obvious special-case), so we
1770 * do an early lockdep release here:
1771 */
1772#ifndef __ARCH_WANT_UNLOCKED_CTXSW
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07001773 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Ingo Molnar3a5f5e42006-07-14 00:24:27 -07001774#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07001775
1776 /* Here we just switch the register state and the stack. */
1777 switch_to(prev, next, prev);
1778
Ingo Molnardd41f592007-07-09 18:51:59 +02001779 barrier();
1780 /*
1781 * this_rq must be evaluated again because prev may have moved
1782 * CPUs since it called schedule(), thus the 'rq' on its stack
1783 * frame will be invalid.
1784 */
1785 finish_task_switch(this_rq(), prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001786}
1787
1788/*
1789 * nr_running, nr_uninterruptible and nr_context_switches:
1790 *
1791 * externally visible scheduler statistics: current number of runnable
1792 * threads, current number of uninterruptible-sleeping threads, total
1793 * number of context switches performed since bootup.
1794 */
1795unsigned long nr_running(void)
1796{
1797 unsigned long i, sum = 0;
1798
1799 for_each_online_cpu(i)
1800 sum += cpu_rq(i)->nr_running;
1801
1802 return sum;
1803}
1804
1805unsigned long nr_uninterruptible(void)
1806{
1807 unsigned long i, sum = 0;
1808
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001809 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001810 sum += cpu_rq(i)->nr_uninterruptible;
1811
1812 /*
1813 * Since we read the counters locklessly, the sum might be slightly
1814 * inaccurate. Do not allow it to go below zero though:
1815 */
1816 if (unlikely((long)sum < 0))
1817 sum = 0;
1818
1819 return sum;
1820}
1821
1822unsigned long long nr_context_switches(void)
1823{
Steven Rostedtcc94abf2006-06-27 02:54:31 -07001824 int i;
1825 unsigned long long sum = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001826
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001827 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001828 sum += cpu_rq(i)->nr_switches;
1829
1830 return sum;
1831}
1832
1833unsigned long nr_iowait(void)
1834{
1835 unsigned long i, sum = 0;
1836
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08001837 for_each_possible_cpu(i)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001838 sum += atomic_read(&cpu_rq(i)->nr_iowait);
1839
1840 return sum;
1841}
1842
Jack Steinerdb1b1fe2006-03-31 02:31:21 -08001843unsigned long nr_active(void)
1844{
1845 unsigned long i, running = 0, uninterruptible = 0;
1846
1847 for_each_online_cpu(i) {
1848 running += cpu_rq(i)->nr_running;
1849 uninterruptible += cpu_rq(i)->nr_uninterruptible;
1850 }
1851
1852 if (unlikely((long)uninterruptible < 0))
1853 uninterruptible = 0;
1854
1855 return running + uninterruptible;
1856}
1857
Linus Torvalds1da177e2005-04-16 15:20:36 -07001858/*
Ingo Molnardd41f592007-07-09 18:51:59 +02001859 * Update rq->cpu_load[] statistics. This function is usually called every
1860 * scheduler tick (TICK_NSEC).
Ingo Molnar48f24c42006-07-03 00:25:40 -07001861 */
Ingo Molnardd41f592007-07-09 18:51:59 +02001862static void update_cpu_load(struct rq *this_rq)
Ingo Molnar48f24c42006-07-03 00:25:40 -07001863{
Ingo Molnardd41f592007-07-09 18:51:59 +02001864 u64 fair_delta64, exec_delta64, idle_delta64, sample_interval64, tmp64;
1865 unsigned long total_load = this_rq->ls.load.weight;
1866 unsigned long this_load = total_load;
1867 struct load_stat *ls = &this_rq->ls;
1868 u64 now = __rq_clock(this_rq);
1869 int i, scale;
1870
1871 this_rq->nr_load_updates++;
1872 if (unlikely(!(sysctl_sched_features & SCHED_FEAT_PRECISE_CPU_LOAD)))
1873 goto do_avg;
1874
1875 /* Update delta_fair/delta_exec fields first */
1876 update_curr_load(this_rq, now);
1877
1878 fair_delta64 = ls->delta_fair + 1;
1879 ls->delta_fair = 0;
1880
1881 exec_delta64 = ls->delta_exec + 1;
1882 ls->delta_exec = 0;
1883
1884 sample_interval64 = now - ls->load_update_last;
1885 ls->load_update_last = now;
1886
1887 if ((s64)sample_interval64 < (s64)TICK_NSEC)
1888 sample_interval64 = TICK_NSEC;
1889
1890 if (exec_delta64 > sample_interval64)
1891 exec_delta64 = sample_interval64;
1892
1893 idle_delta64 = sample_interval64 - exec_delta64;
1894
1895 tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
1896 tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
1897
1898 this_load = (unsigned long)tmp64;
1899
1900do_avg:
1901
1902 /* Update our load: */
1903 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
1904 unsigned long old_load, new_load;
1905
1906 /* scale is effectively 1 << i now, and >> i divides by scale */
1907
1908 old_load = this_rq->cpu_load[i];
1909 new_load = this_load;
1910
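		/*
		 * The update below is an exponentially decaying average:
		 *
		 *	cpu_load[i] = old_load * (2^i - 1)/2^i + new_load / 2^i
		 *
		 * so cpu_load[0] tracks the instantaneous load, while higher
		 * indices react progressively more slowly.
		 */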
1911 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
1912 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07001913}
1914
Ingo Molnardd41f592007-07-09 18:51:59 +02001915#ifdef CONFIG_SMP
1916
Ingo Molnar48f24c42006-07-03 00:25:40 -07001917/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001918 * double_rq_lock - safely lock two runqueues
1919 *
1920 * Note this does not disable interrupts like task_rq_lock,
1921 * you need to do so manually before calling.
1922 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001923static void double_rq_lock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001924 __acquires(rq1->lock)
1925 __acquires(rq2->lock)
1926{
Kirill Korotaev054b9102006-12-10 02:20:11 -08001927 BUG_ON(!irqs_disabled());
Linus Torvalds1da177e2005-04-16 15:20:36 -07001928 if (rq1 == rq2) {
1929 spin_lock(&rq1->lock);
1930 __acquire(rq2->lock); /* Fake it out ;) */
1931 } else {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001932 if (rq1 < rq2) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001933 spin_lock(&rq1->lock);
1934 spin_lock(&rq2->lock);
1935 } else {
1936 spin_lock(&rq2->lock);
1937 spin_lock(&rq1->lock);
1938 }
1939 }
1940}
1941
1942/*
1943 * double_rq_unlock - safely unlock two runqueues
1944 *
1945 * Note this does not restore interrupts like task_rq_unlock,
1946 * you need to do so manually after calling.
1947 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001948static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001949 __releases(rq1->lock)
1950 __releases(rq2->lock)
1951{
1952 spin_unlock(&rq1->lock);
1953 if (rq1 != rq2)
1954 spin_unlock(&rq2->lock);
1955 else
1956 __release(rq2->lock);
1957}
1958
1959/*
1960 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1961 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07001962static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001963 __releases(this_rq->lock)
1964 __acquires(busiest->lock)
1965 __acquires(this_rq->lock)
1966{
Kirill Korotaev054b9102006-12-10 02:20:11 -08001967 if (unlikely(!irqs_disabled())) {
1968 /* printk() doesn't work well under rq->lock */
1969 spin_unlock(&this_rq->lock);
1970 BUG_ON(1);
1971 }
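	/*
	 * Lock-ordering note: if the trylock fails and busiest has the lower
	 * address, this_rq->lock is dropped and both locks are re-taken in
	 * address order, so two CPUs running this path against each other
	 * cannot deadlock. Since this_rq->lock may be dropped here, callers
	 * should not assume this_rq's state is unchanged across this call.
	 */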
Linus Torvalds1da177e2005-04-16 15:20:36 -07001972 if (unlikely(!spin_trylock(&busiest->lock))) {
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07001973 if (busiest < this_rq) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001974 spin_unlock(&this_rq->lock);
1975 spin_lock(&busiest->lock);
1976 spin_lock(&this_rq->lock);
1977 } else
1978 spin_lock(&busiest->lock);
1979 }
1980}
1981
1982/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07001983 * If dest_cpu is allowed for this process, migrate the task to it.
1984 * This is accomplished by forcing the cpu_allowed mask to only
1985 * allow dest_cpu, which will force the task onto dest_cpu. Then
1986 * the cpu_allowed mask is restored.
1987 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07001988static void sched_migrate_task(struct task_struct *p, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001989{
Ingo Molnar70b97a72006-07-03 00:25:42 -07001990 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001991 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07001992 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001993
1994 rq = task_rq_lock(p, &flags);
1995 if (!cpu_isset(dest_cpu, p->cpus_allowed)
1996 || unlikely(cpu_is_offline(dest_cpu)))
1997 goto out;
1998
1999 /* force the process onto the specified CPU */
2000 if (migrate_task(p, dest_cpu, &req)) {
2001 /* Need to wait for migration thread (might exit: take ref). */
2002 struct task_struct *mt = rq->migration_thread;
Ingo Molnar36c8b582006-07-03 00:25:41 -07002003
Linus Torvalds1da177e2005-04-16 15:20:36 -07002004 get_task_struct(mt);
2005 task_rq_unlock(rq, &flags);
2006 wake_up_process(mt);
2007 put_task_struct(mt);
2008 wait_for_completion(&req.done);
Ingo Molnar36c8b582006-07-03 00:25:41 -07002009
Linus Torvalds1da177e2005-04-16 15:20:36 -07002010 return;
2011 }
2012out:
2013 task_rq_unlock(rq, &flags);
2014}
2015
2016/*
Nick Piggin476d1392005-06-25 14:57:29 -07002017 * sched_exec - execve() is a valuable balancing opportunity, because at
2018 * this point the task has the smallest effective memory and cache footprint.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002019 */
2020void sched_exec(void)
2021{
Linus Torvalds1da177e2005-04-16 15:20:36 -07002022 int new_cpu, this_cpu = get_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002023 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002024 put_cpu();
Nick Piggin476d1392005-06-25 14:57:29 -07002025 if (new_cpu != this_cpu)
2026 sched_migrate_task(current, new_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002027}
2028
2029/*
2030 * pull_task - move a task from a remote runqueue to the local runqueue.
2031 * Both runqueues must be locked.
2032 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002033static void pull_task(struct rq *src_rq, struct task_struct *p,
2034 struct rq *this_rq, int this_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002035{
Ingo Molnardd41f592007-07-09 18:51:59 +02002036 deactivate_task(src_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002037 set_task_cpu(p, this_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02002038 activate_task(this_rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002039 /*
2040 * Note that idle threads have a prio of MAX_PRIO, so this test
2041 * will always be true for them.
2042 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002043 check_preempt_curr(this_rq, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002044}
2045
2046/*
2047 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2048 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08002049static
Ingo Molnar70b97a72006-07-03 00:25:42 -07002050int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002051 struct sched_domain *sd, enum cpu_idle_type idle,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07002052 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002053{
2054 /*
2055 * We do not migrate tasks that:
2056 * 1) are running (obviously), or
2057 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2058 * 3) are cache-hot on their current CPU.
2059 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07002060 if (!cpu_isset(this_cpu, p->cpus_allowed))
2061 return 0;
Nick Piggin81026792005-06-25 14:57:07 -07002062 *all_pinned = 0;
2063
2064 if (task_running(rq, p))
2065 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002066
2067 /*
Ingo Molnardd41f592007-07-09 18:51:59 +02002068 * Aggressive migration if too many balance attempts have failed:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002069 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002070 if (sd->nr_balance_failed > sd->cache_nice_tries)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002071 return 1;
2072
Linus Torvalds1da177e2005-04-16 15:20:36 -07002073 return 1;
2074}
2075
Ingo Molnardd41f592007-07-09 18:51:59 +02002076static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2077 unsigned long max_nr_move, unsigned long max_load_move,
2078 struct sched_domain *sd, enum cpu_idle_type idle,
2079 int *all_pinned, unsigned long *load_moved,
2080 int this_best_prio, int best_prio, int best_prio_seen,
2081 struct rq_iterator *iterator)
2082{
2083 int pulled = 0, pinned = 0, skip_for_load;
2084 struct task_struct *p;
2085 long rem_load_move = max_load_move;
2086
2087 if (max_nr_move == 0 || max_load_move == 0)
2088 goto out;
2089
2090 pinned = 1;
2091
2092 /*
2093 * Start the load-balancing iterator:
2094 */
2095 p = iterator->start(iterator->arg);
2096next:
2097 if (!p)
2098 goto out;
2099 /*
2100 * To help distribute high-priority tasks across CPUs, we don't
2101 * skip a task if it will be the highest-priority task (i.e. smallest
2102 * prio value) on its new queue, regardless of its load weight.
2103 */
2104 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2105 SCHED_LOAD_SCALE_FUZZ;
2106 if (skip_for_load && p->prio < this_best_prio)
2107 skip_for_load = !best_prio_seen && p->prio == best_prio;
2108 if (skip_for_load ||
2109 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2110
2111 best_prio_seen |= p->prio == best_prio;
2112 p = iterator->next(iterator->arg);
2113 goto next;
2114 }
2115
2116 pull_task(busiest, p, this_rq, this_cpu);
2117 pulled++;
2118 rem_load_move -= p->se.load.weight;
2119
2120 /*
2121 * We only want to steal up to the prescribed number of tasks
2122 * and the prescribed amount of weighted load.
2123 */
2124 if (pulled < max_nr_move && rem_load_move > 0) {
2125 if (p->prio < this_best_prio)
2126 this_best_prio = p->prio;
2127 p = iterator->next(iterator->arg);
2128 goto next;
2129 }
2130out:
2131 /*
2132 * Right now, this is the only place pull_task() is called,
2133 * so we can safely collect pull_task() stats here rather than
2134 * inside pull_task().
2135 */
2136 schedstat_add(sd, lb_gained[idle], pulled);
2137
2138 if (all_pinned)
2139 *all_pinned = pinned;
2140 *load_moved = max_load_move - rem_load_move;
2141 return pulled;
2142}
Ingo Molnar48f24c42006-07-03 00:25:40 -07002143
Linus Torvalds1da177e2005-04-16 15:20:36 -07002144/*
Peter Williams2dd73a42006-06-27 02:54:34 -07002145 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
2146 * load from busiest to this_rq, as part of a balancing operation within
2147 * "domain". Returns the number of tasks moved.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002148 *
2149 * Called with both runqueues locked.
2150 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002151static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002152 unsigned long max_nr_move, unsigned long max_load_move,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002153 struct sched_domain *sd, enum cpu_idle_type idle,
Peter Williams2dd73a42006-06-27 02:54:34 -07002154 int *all_pinned)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002155{
Ingo Molnardd41f592007-07-09 18:51:59 +02002156 struct sched_class *class = sched_class_highest;
2157 unsigned long load_moved, total_nr_moved = 0, nr_moved;
2158 long rem_load_move = max_load_move;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002159
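	/*
	 * Walk the scheduling classes from the highest-priority one
	 * downwards (sched_class_highest, then each ->next), letting each
	 * class consume part of the remaining task and load budget before
	 * the next class gets a chance.
	 */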
Ingo Molnardd41f592007-07-09 18:51:59 +02002160 do {
2161 nr_moved = class->load_balance(this_rq, this_cpu, busiest,
2162 max_nr_move, (unsigned long)rem_load_move,
2163 sd, idle, all_pinned, &load_moved);
2164 total_nr_moved += nr_moved;
2165 max_nr_move -= nr_moved;
2166 rem_load_move -= load_moved;
2167 class = class->next;
2168 } while (class && max_nr_move && rem_load_move > 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002169
Ingo Molnardd41f592007-07-09 18:51:59 +02002170 return total_nr_moved;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002171}
2172
2173/*
2174 * find_busiest_group finds and returns the busiest CPU group within the
Ingo Molnar48f24c42006-07-03 00:25:40 -07002175 * domain. It calculates and returns the amount of weighted load which
2176 * should be moved to restore balance via the imbalance parameter.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002177 */
2178static struct sched_group *
2179find_busiest_group(struct sched_domain *sd, int this_cpu,
Ingo Molnardd41f592007-07-09 18:51:59 +02002180 unsigned long *imbalance, enum cpu_idle_type idle,
2181 int *sd_idle, cpumask_t *cpus, int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002182{
2183 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
2184 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002185 unsigned long max_pull;
Peter Williams2dd73a42006-06-27 02:54:34 -07002186 unsigned long busiest_load_per_task, busiest_nr_running;
2187 unsigned long this_load_per_task, this_nr_running;
Nick Piggin78979862005-06-25 14:57:13 -07002188 int load_idx;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002189#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2190 int power_savings_balance = 1;
2191 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2192 unsigned long min_nr_running = ULONG_MAX;
2193 struct sched_group *group_min = NULL, *group_leader = NULL;
2194#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002195
2196 max_load = this_load = total_load = total_pwr = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002197 busiest_load_per_task = busiest_nr_running = 0;
2198 this_load_per_task = this_nr_running = 0;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002199 if (idle == CPU_NOT_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002200 load_idx = sd->busy_idx;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002201 else if (idle == CPU_NEWLY_IDLE)
Nick Piggin78979862005-06-25 14:57:13 -07002202 load_idx = sd->newidle_idx;
2203 else
2204 load_idx = sd->idle_idx;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002205
2206 do {
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002207 unsigned long load, group_capacity;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002208 int local_group;
2209 int i;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002210 unsigned int balance_cpu = -1, first_idle_cpu = 0;
Peter Williams2dd73a42006-06-27 02:54:34 -07002211 unsigned long sum_nr_running, sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002212
2213 local_group = cpu_isset(this_cpu, group->cpumask);
2214
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002215 if (local_group)
2216 balance_cpu = first_cpu(group->cpumask);
2217
Linus Torvalds1da177e2005-04-16 15:20:36 -07002218 /* Tally up the load of all CPUs in the group */
Peter Williams2dd73a42006-06-27 02:54:34 -07002219 sum_weighted_load = sum_nr_running = avg_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002220
2221 for_each_cpu_mask(i, group->cpumask) {
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002222 struct rq *rq;
2223
2224 if (!cpu_isset(i, *cpus))
2225 continue;
2226
2227 rq = cpu_rq(i);
Peter Williams2dd73a42006-06-27 02:54:34 -07002228
Nick Piggin5969fe02005-09-10 00:26:19 -07002229 if (*sd_idle && !idle_cpu(i))
2230 *sd_idle = 0;
2231
Linus Torvalds1da177e2005-04-16 15:20:36 -07002232 /* Bias balancing toward cpus of our domain */
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002233 if (local_group) {
2234 if (idle_cpu(i) && !first_idle_cpu) {
2235 first_idle_cpu = 1;
2236 balance_cpu = i;
2237 }
2238
Nick Piggina2000572006-02-10 01:51:02 -08002239 load = target_load(i, load_idx);
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002240 } else
Nick Piggina2000572006-02-10 01:51:02 -08002241 load = source_load(i, load_idx);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002242
2243 avg_load += load;
Peter Williams2dd73a42006-06-27 02:54:34 -07002244 sum_nr_running += rq->nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002245 sum_weighted_load += weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002246 }
2247
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002248 /*
2249 * The first idle cpu or the first cpu (busiest) in this sched group
2250 * is eligible for doing load balancing at this and higher
2251 * domains.
2252 */
2253 if (local_group && balance_cpu != this_cpu && balance) {
2254 *balance = 0;
2255 goto ret;
2256 }
2257
Linus Torvalds1da177e2005-04-16 15:20:36 -07002258 total_load += avg_load;
Eric Dumazet5517d862007-05-08 00:32:57 -07002259 total_pwr += group->__cpu_power;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002260
2261 /* Adjust by relative CPU power of the group */
Eric Dumazet5517d862007-05-08 00:32:57 -07002262 avg_load = sg_div_cpu_power(group,
2263 avg_load * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002264
Eric Dumazet5517d862007-05-08 00:32:57 -07002265 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002266
Linus Torvalds1da177e2005-04-16 15:20:36 -07002267 if (local_group) {
2268 this_load = avg_load;
2269 this = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002270 this_nr_running = sum_nr_running;
2271 this_load_per_task = sum_weighted_load;
2272 } else if (avg_load > max_load &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002273 sum_nr_running > group_capacity) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002274 max_load = avg_load;
2275 busiest = group;
Peter Williams2dd73a42006-06-27 02:54:34 -07002276 busiest_nr_running = sum_nr_running;
2277 busiest_load_per_task = sum_weighted_load;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002278 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002279
2280#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2281 /*
2282 * Busy processors will not participate in power savings
2283 * balance.
2284 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002285 if (idle == CPU_NOT_IDLE ||
2286 !(sd->flags & SD_POWERSAVINGS_BALANCE))
2287 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002288
2289 /*
2290 * If the local group is idle or completely loaded,
2291 * there is no need to do power savings balance at this domain.
2292 */
2293 if (local_group && (this_nr_running >= group_capacity ||
2294 !this_nr_running))
2295 power_savings_balance = 0;
2296
Ingo Molnardd41f592007-07-09 18:51:59 +02002297 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002298 * If a group is already running at full capacity or idle,
2299 * don't include that group in power savings calculations
Ingo Molnardd41f592007-07-09 18:51:59 +02002300 */
2301 if (!power_savings_balance || sum_nr_running >= group_capacity
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002302 || !sum_nr_running)
Ingo Molnardd41f592007-07-09 18:51:59 +02002303 goto group_next;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002304
Ingo Molnardd41f592007-07-09 18:51:59 +02002305 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002306 * Calculate the group which has the least non-idle load.
Ingo Molnardd41f592007-07-09 18:51:59 +02002307 * This is the group from where we need to pick up the load
2308 * for saving power
2309 */
2310 if ((sum_nr_running < min_nr_running) ||
2311 (sum_nr_running == min_nr_running &&
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002312 first_cpu(group->cpumask) <
2313 first_cpu(group_min->cpumask))) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002314 group_min = group;
2315 min_nr_running = sum_nr_running;
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002316 min_load_per_task = sum_weighted_load /
2317 sum_nr_running;
Ingo Molnardd41f592007-07-09 18:51:59 +02002318 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002319
Ingo Molnardd41f592007-07-09 18:51:59 +02002320 /*
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002321 * Calculate the group which is almost near its
Ingo Molnardd41f592007-07-09 18:51:59 +02002322 * capacity but still has some space to pick up some load
2323 * from other group and save more power
2324 */
2325 if (sum_nr_running <= group_capacity - 1) {
2326 if (sum_nr_running > leader_nr_running ||
2327 (sum_nr_running == leader_nr_running &&
2328 first_cpu(group->cpumask) >
2329 first_cpu(group_leader->cpumask))) {
2330 group_leader = group;
2331 leader_nr_running = sum_nr_running;
2332 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07002333 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002334group_next:
2335#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07002336 group = group->next;
2337 } while (group != sd->groups);
2338
Peter Williams2dd73a42006-06-27 02:54:34 -07002339 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002340 goto out_balanced;
2341
2342 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
2343
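	/*
	 * Illustrative numbers: with an imbalance_pct of 125 the check below
	 * only lets balancing proceed when 100 * max_load exceeds
	 * 125 * this_load, i.e. when the busiest group carries more than 25%
	 * extra load compared to this group.
	 */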
2344 if (this_load >= avg_load ||
2345 100*max_load <= sd->imbalance_pct*this_load)
2346 goto out_balanced;
2347
Peter Williams2dd73a42006-06-27 02:54:34 -07002348 busiest_load_per_task /= busiest_nr_running;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002349 /*
2350 * We're trying to get all the cpus to the average_load, so we don't
2351 * want to push ourselves above the average load, nor do we wish to
2352 * reduce the max loaded cpu below the average load, as either of these
2353 * actions would just result in more rebalancing later, and ping-pong
2354 * tasks around. Thus we look for the minimum possible imbalance.
2355 * Negative imbalances (*we* are more loaded than anyone else) will
2356 * be counted as no imbalance for these purposes -- we can't fix that
2357 * by pulling tasks to us. Be careful of negative numbers as they'll
2358 * appear as very large values with unsigned longs.
2359 */
Peter Williams2dd73a42006-06-27 02:54:34 -07002360 if (max_load <= busiest_load_per_task)
2361 goto out_balanced;
2362
2363 /*
2364 * In the presence of smp nice balancing, certain scenarios can have
2365 * max load less than avg load (as we skip the groups at or below
2366 * their cpu_power while calculating max_load).
2367 */
2368 if (max_load < avg_load) {
2369 *imbalance = 0;
2370 goto small_imbalance;
2371 }
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002372
2373 /* Don't want to pull so many tasks that a group would go idle */
Peter Williams2dd73a42006-06-27 02:54:34 -07002374 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
Siddha, Suresh B0c117f12005-09-10 00:26:21 -07002375
Linus Torvalds1da177e2005-04-16 15:20:36 -07002376 /* How much load to actually move to equalise the imbalance */
Eric Dumazet5517d862007-05-08 00:32:57 -07002377 *imbalance = min(max_pull * busiest->__cpu_power,
2378 (avg_load - this_load) * this->__cpu_power)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002379 / SCHED_LOAD_SCALE;
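	/*
	 * The min() above moves no more than the busiest group can shed
	 * without dropping below the average (or below one average task's
	 * worth of load, via max_pull), and no more than this group can
	 * absorb before rising above the average; both terms are scaled by
	 * the groups' __cpu_power and normalised back by SCHED_LOAD_SCALE.
	 */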
2380
Peter Williams2dd73a42006-06-27 02:54:34 -07002381 /*
2382 * If *imbalance is less than the average load per runnable task,
2383 * there is no guarantee that any tasks will be moved, so we may
2384 * need to bump its value to force at least one task to be
2385 * moved.
2386 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002387 if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07002388 unsigned long tmp, pwr_now, pwr_move;
Peter Williams2dd73a42006-06-27 02:54:34 -07002389 unsigned int imbn;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002390
Peter Williams2dd73a42006-06-27 02:54:34 -07002391small_imbalance:
2392 pwr_move = pwr_now = 0;
2393 imbn = 2;
2394 if (this_nr_running) {
2395 this_load_per_task /= this_nr_running;
2396 if (busiest_load_per_task > this_load_per_task)
2397 imbn = 1;
2398 } else
2399 this_load_per_task = SCHED_LOAD_SCALE;
2400
Ingo Molnardd41f592007-07-09 18:51:59 +02002401 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >=
2402 busiest_load_per_task * imbn) {
Peter Williams2dd73a42006-06-27 02:54:34 -07002403 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002404 return busiest;
2405 }
2406
2407 /*
2408 * OK, we don't have enough imbalance to justify moving tasks,
2409 * however we may be able to increase total CPU power used by
2410 * moving them.
2411 */
2412
Eric Dumazet5517d862007-05-08 00:32:57 -07002413 pwr_now += busiest->__cpu_power *
2414 min(busiest_load_per_task, max_load);
2415 pwr_now += this->__cpu_power *
2416 min(this_load_per_task, this_load);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002417 pwr_now /= SCHED_LOAD_SCALE;
2418
2419 /* Amount of load we'd subtract */
Eric Dumazet5517d862007-05-08 00:32:57 -07002420 tmp = sg_div_cpu_power(busiest,
2421 busiest_load_per_task * SCHED_LOAD_SCALE);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002422 if (max_load > tmp)
Eric Dumazet5517d862007-05-08 00:32:57 -07002423 pwr_move += busiest->__cpu_power *
Peter Williams2dd73a42006-06-27 02:54:34 -07002424 min(busiest_load_per_task, max_load - tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002425
2426 /* Amount of load we'd add */
Eric Dumazet5517d862007-05-08 00:32:57 -07002427 if (max_load * busiest->__cpu_power <
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08002428 busiest_load_per_task * SCHED_LOAD_SCALE)
Eric Dumazet5517d862007-05-08 00:32:57 -07002429 tmp = sg_div_cpu_power(this,
2430 max_load * busiest->__cpu_power);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002431 else
Eric Dumazet5517d862007-05-08 00:32:57 -07002432 tmp = sg_div_cpu_power(this,
2433 busiest_load_per_task * SCHED_LOAD_SCALE);
2434 pwr_move += this->__cpu_power *
2435 min(this_load_per_task, this_load + tmp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002436 pwr_move /= SCHED_LOAD_SCALE;
2437
2438 /* Move if we gain throughput */
2439 if (pwr_move <= pwr_now)
2440 goto out_balanced;
2441
Peter Williams2dd73a42006-06-27 02:54:34 -07002442 *imbalance = busiest_load_per_task;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002443 }
2444
Linus Torvalds1da177e2005-04-16 15:20:36 -07002445 return busiest;
2446
2447out_balanced:
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002448#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002449 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002450 goto ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002451
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002452 if (this == group_leader && group_leader != group_min) {
2453 *imbalance = min_load_per_task;
2454 return group_min;
2455 }
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002456#endif
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002457ret:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002458 *imbalance = 0;
2459 return NULL;
2460}
2461
2462/*
2463 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2464 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002465static struct rq *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002466find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002467 unsigned long imbalance, cpumask_t *cpus)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002468{
Ingo Molnar70b97a72006-07-03 00:25:42 -07002469 struct rq *busiest = NULL, *rq;
Peter Williams2dd73a42006-06-27 02:54:34 -07002470 unsigned long max_load = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002471 int i;
2472
2473 for_each_cpu_mask(i, group->cpumask) {
Ingo Molnardd41f592007-07-09 18:51:59 +02002474 unsigned long wl;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002475
2476 if (!cpu_isset(i, *cpus))
2477 continue;
2478
Ingo Molnar48f24c42006-07-03 00:25:40 -07002479 rq = cpu_rq(i);
Ingo Molnardd41f592007-07-09 18:51:59 +02002480 wl = weighted_cpuload(i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002481
Ingo Molnardd41f592007-07-09 18:51:59 +02002482 if (rq->nr_running == 1 && wl > imbalance)
Peter Williams2dd73a42006-06-27 02:54:34 -07002483 continue;
2484
Ingo Molnardd41f592007-07-09 18:51:59 +02002485 if (wl > max_load) {
2486 max_load = wl;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002487 busiest = rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002488 }
2489 }
2490
2491 return busiest;
2492}
2493
2494/*
Nick Piggin77391d72005-06-25 14:57:30 -07002495 * Max backoff if we encounter pinned tasks. The exact value is pretty
2496 * arbitrary, as long as it is large enough.
2497 */
2498#define MAX_PINNED_INTERVAL 512
2499
Ingo Molnar48f24c42006-07-03 00:25:40 -07002500static inline unsigned long minus_1_or_zero(unsigned long n)
2501{
2502 return n > 0 ? n - 1 : 0;
2503}
2504
Nick Piggin77391d72005-06-25 14:57:30 -07002505/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07002506 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2507 * tasks if there is an imbalance.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002508 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002509static int load_balance(int this_cpu, struct rq *this_rq,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002510 struct sched_domain *sd, enum cpu_idle_type idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002511 int *balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002512{
Ingo Molnar48f24c42006-07-03 00:25:40 -07002513 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002514 struct sched_group *group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002515 unsigned long imbalance;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002516 struct rq *busiest;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002517 cpumask_t cpus = CPU_MASK_ALL;
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002518 unsigned long flags;
Nick Piggin5969fe02005-09-10 00:26:19 -07002519
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002520 /*
2521 * When the power savings policy is enabled for the parent domain, an idle
2522 * sibling can pick up load irrespective of busy siblings. In this case,
Ingo Molnardd41f592007-07-09 18:51:59 +02002523 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002524 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002525 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002526 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002527 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002528 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002529
Linus Torvalds1da177e2005-04-16 15:20:36 -07002530 schedstat_inc(sd, lb_cnt[idle]);
2531
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002532redo:
2533 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002534 &cpus, balance);
2535
Chen, Kenneth W06066712006-12-10 02:20:35 -08002536 if (*balance == 0)
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002537 goto out_balanced;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002538
Linus Torvalds1da177e2005-04-16 15:20:36 -07002539 if (!group) {
2540 schedstat_inc(sd, lb_nobusyg[idle]);
2541 goto out_balanced;
2542 }
2543
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002544 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002545 if (!busiest) {
2546 schedstat_inc(sd, lb_nobusyq[idle]);
2547 goto out_balanced;
2548 }
2549
Nick Piggindb935db2005-06-25 14:57:11 -07002550 BUG_ON(busiest == this_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551
2552 schedstat_add(sd, lb_imbalance[idle], imbalance);
2553
2554 nr_moved = 0;
2555 if (busiest->nr_running > 1) {
2556 /*
2557 * Attempt to move tasks. If find_busiest_group has found
2558 * an imbalance but busiest->nr_running <= 1, the group is
2559 * still unbalanced. nr_moved simply stays zero, so it is
2560 * correctly treated as an imbalance.
2561 */
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002562 local_irq_save(flags);
Nick Piggine17224b2005-09-10 00:26:18 -07002563 double_rq_lock(this_rq, busiest);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002564 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002565 minus_1_or_zero(busiest->nr_running),
2566 imbalance, sd, idle, &all_pinned);
Nick Piggine17224b2005-09-10 00:26:18 -07002567 double_rq_unlock(this_rq, busiest);
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002568 local_irq_restore(flags);
Nick Piggin81026792005-06-25 14:57:07 -07002569
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002570 /*
2571 * some other cpu did the load balance for us.
2572 */
2573 if (nr_moved && this_cpu != smp_processor_id())
2574 resched_cpu(this_cpu);
2575
Nick Piggin81026792005-06-25 14:57:07 -07002576 /* All tasks on this runqueue were pinned by CPU affinity */
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002577 if (unlikely(all_pinned)) {
2578 cpu_clear(cpu_of(busiest), cpus);
2579 if (!cpus_empty(cpus))
2580 goto redo;
Nick Piggin81026792005-06-25 14:57:07 -07002581 goto out_balanced;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002582 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002583 }
Nick Piggin81026792005-06-25 14:57:07 -07002584
Linus Torvalds1da177e2005-04-16 15:20:36 -07002585 if (!nr_moved) {
2586 schedstat_inc(sd, lb_failed[idle]);
2587 sd->nr_balance_failed++;
2588
2589 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002590
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002591 spin_lock_irqsave(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002592
2593 /* don't kick the migration_thread, if the curr
2594 * task on busiest cpu can't be moved to this_cpu
2595 */
2596 if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002597 spin_unlock_irqrestore(&busiest->lock, flags);
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002598 all_pinned = 1;
2599 goto out_one_pinned;
2600 }
2601
Linus Torvalds1da177e2005-04-16 15:20:36 -07002602 if (!busiest->active_balance) {
2603 busiest->active_balance = 1;
2604 busiest->push_cpu = this_cpu;
Nick Piggin81026792005-06-25 14:57:07 -07002605 active_balance = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002606 }
Christoph Lameterfe2eea32006-12-10 02:20:21 -08002607 spin_unlock_irqrestore(&busiest->lock, flags);
Nick Piggin81026792005-06-25 14:57:07 -07002608 if (active_balance)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002609 wake_up_process(busiest->migration_thread);
2610
2611 /*
2612 * We've kicked active balancing, reset the failure
2613 * counter.
2614 */
Nick Piggin39507452005-06-25 14:57:09 -07002615 sd->nr_balance_failed = sd->cache_nice_tries+1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002616 }
Nick Piggin81026792005-06-25 14:57:07 -07002617 } else
Linus Torvalds1da177e2005-04-16 15:20:36 -07002618 sd->nr_balance_failed = 0;
2619
Nick Piggin81026792005-06-25 14:57:07 -07002620 if (likely(!active_balance)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002621 /* We were unbalanced, so reset the balancing interval */
2622 sd->balance_interval = sd->min_interval;
Nick Piggin81026792005-06-25 14:57:07 -07002623 } else {
2624 /*
2625 * If we've begun active balancing, start to back off. This
2626 * case may not be covered by the all_pinned logic if there
2627 * is only 1 task on the busy runqueue (because we don't call
2628 * move_tasks).
2629 */
2630 if (sd->balance_interval < sd->max_interval)
2631 sd->balance_interval *= 2;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002632 }
2633
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07002634 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002635 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002636 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002637 return nr_moved;
2638
2639out_balanced:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002640 schedstat_inc(sd, lb_balanced[idle]);
2641
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002642 sd->nr_balance_failed = 0;
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002643
2644out_one_pinned:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002645 /* tune up the balancing interval */
Nick Piggin77391d72005-06-25 14:57:30 -07002646 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
2647 (sd->balance_interval < sd->max_interval))
Linus Torvalds1da177e2005-04-16 15:20:36 -07002648 sd->balance_interval *= 2;
2649
Ingo Molnar48f24c42006-07-03 00:25:40 -07002650 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002651 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002652 return -1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002653 return 0;
2654}
2655
2656/*
2657 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2658 * tasks if there is an imbalance.
2659 *
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002660 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
Linus Torvalds1da177e2005-04-16 15:20:36 -07002661 * this_rq is locked.
2662 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07002663static int
Ingo Molnar70b97a72006-07-03 00:25:42 -07002664load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002665{
2666 struct sched_group *group;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002667 struct rq *busiest = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002668 unsigned long imbalance;
2669 int nr_moved = 0;
Nick Piggin5969fe02005-09-10 00:26:19 -07002670 int sd_idle = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002671 cpumask_t cpus = CPU_MASK_ALL;
Nick Piggin5969fe02005-09-10 00:26:19 -07002672
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002673 /*
2674 * When the power savings policy is enabled for the parent domain, an idle
2675 * sibling can pick up load irrespective of busy siblings. In this case,
2676 * let the state of the idle sibling percolate up as IDLE, instead of
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002677 * portraying it as CPU_NOT_IDLE.
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002678 */
2679 if (sd->flags & SD_SHARE_CPUPOWER &&
2680 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002681 sd_idle = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002682
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002683 schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002684redo:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002685 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002686 &sd_idle, &cpus, NULL);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002687 if (!group) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002688 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002689 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002690 }
2691
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002692 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance,
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002693 &cpus);
Nick Piggindb935db2005-06-25 14:57:11 -07002694 if (!busiest) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002695 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002696 goto out_balanced;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002697 }
2698
Nick Piggindb935db2005-06-25 14:57:11 -07002699 BUG_ON(busiest == this_rq);
2700
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002701 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002702
2703 nr_moved = 0;
2704 if (busiest->nr_running > 1) {
2705 /* Attempt to move tasks */
2706 double_lock_balance(this_rq, busiest);
2707 nr_moved = move_tasks(this_rq, this_cpu, busiest,
Peter Williams2dd73a42006-06-27 02:54:34 -07002708 minus_1_or_zero(busiest->nr_running),
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002709 imbalance, sd, CPU_NEWLY_IDLE, NULL);
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002710 spin_unlock(&busiest->lock);
Christoph Lameter0a2966b2006-09-25 23:30:51 -07002711
2712 if (!nr_moved) {
2713 cpu_clear(cpu_of(busiest), cpus);
2714 if (!cpus_empty(cpus))
2715 goto redo;
2716 }
Nick Piggind6d5cfa2005-09-10 00:26:16 -07002717 }
2718
Nick Piggin5969fe02005-09-10 00:26:19 -07002719 if (!nr_moved) {
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002720 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2722 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002723 return -1;
2724 } else
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002725 sd->nr_balance_failed = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002726
Linus Torvalds1da177e2005-04-16 15:20:36 -07002727 return nr_moved;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002728
2729out_balanced:
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002730 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
Ingo Molnar48f24c42006-07-03 00:25:40 -07002731 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
Siddha, Suresh B89c47102006-10-03 01:14:09 -07002732 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
Nick Piggin5969fe02005-09-10 00:26:19 -07002733 return -1;
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002734 sd->nr_balance_failed = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07002735
Nick Piggin16cfb1c2005-06-25 14:57:08 -07002736 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002737}
2738
2739/*
2740 * idle_balance is called by schedule() if this_cpu is about to become
2741 * idle. Attempts to pull tasks from other CPUs.
2742 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002743static void idle_balance(int this_cpu, struct rq *this_rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002744{
2745 struct sched_domain *sd;
Ingo Molnardd41f592007-07-09 18:51:59 +02002746 int pulled_task = -1;
2747 unsigned long next_balance = jiffies + HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002748
2749 for_each_domain(this_cpu, sd) {
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002750 unsigned long interval;
2751
2752 if (!(sd->flags & SD_LOAD_BALANCE))
2753 continue;
2754
2755 if (sd->flags & SD_BALANCE_NEWIDLE)
Ingo Molnar48f24c42006-07-03 00:25:40 -07002756 /* If we've pulled tasks over stop searching: */
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002757 pulled_task = load_balance_newidle(this_cpu,
Christoph Lameter92c4ca52007-06-23 17:16:33 -07002758 this_rq, sd);
2759
2760 interval = msecs_to_jiffies(sd->balance_interval);
2761 if (time_after(next_balance, sd->last_balance + interval))
2762 next_balance = sd->last_balance + interval;
2763 if (pulled_task)
2764 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002765 }
Ingo Molnardd41f592007-07-09 18:51:59 +02002766 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002767 /*
2768 * We are going idle. next_balance may be set based on
2769 * a busy processor. So reset next_balance.
2770 */
2771 this_rq->next_balance = next_balance;
Ingo Molnardd41f592007-07-09 18:51:59 +02002772 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002773}
2774
2775/*
2776 * active_load_balance is run by migration threads. It pushes running tasks
2777 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
2778 * running on each physical CPU where possible, and avoids physical /
2779 * logical imbalances.
2780 *
2781 * Called with busiest_rq locked.
2782 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07002783static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002784{
Nick Piggin39507452005-06-25 14:57:09 -07002785 int target_cpu = busiest_rq->push_cpu;
Ingo Molnar70b97a72006-07-03 00:25:42 -07002786 struct sched_domain *sd;
2787 struct rq *target_rq;
Nick Piggin39507452005-06-25 14:57:09 -07002788
Ingo Molnar48f24c42006-07-03 00:25:40 -07002789 /* Is there any task to move? */
Nick Piggin39507452005-06-25 14:57:09 -07002790 if (busiest_rq->nr_running <= 1)
Nick Piggin39507452005-06-25 14:57:09 -07002791 return;
2792
2793 target_rq = cpu_rq(target_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002794
2795 /*
Nick Piggin39507452005-06-25 14:57:09 -07002796	 * This condition is "impossible"; if it occurs
2797 * we need to fix it. Originally reported by
2798 * Bjorn Helgaas on a 128-cpu setup.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002799 */
Nick Piggin39507452005-06-25 14:57:09 -07002800 BUG_ON(busiest_rq == target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002801
Nick Piggin39507452005-06-25 14:57:09 -07002802 /* move a task from busiest_rq to target_rq */
2803 double_lock_balance(busiest_rq, target_rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002804
Nick Piggin39507452005-06-25 14:57:09 -07002805 /* Search for an sd spanning us and the target CPU. */
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002806 for_each_domain(target_cpu, sd) {
Nick Piggin39507452005-06-25 14:57:09 -07002807 if ((sd->flags & SD_LOAD_BALANCE) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07002808 cpu_isset(busiest_cpu, sd->span))
Nick Piggin39507452005-06-25 14:57:09 -07002809 break;
Chen, Kenneth Wc96d1452006-06-27 02:54:28 -07002810 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002811
Ingo Molnar48f24c42006-07-03 00:25:40 -07002812 if (likely(sd)) {
2813 schedstat_inc(sd, alb_cnt);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002814
Ingo Molnar48f24c42006-07-03 00:25:40 -07002815 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002816 RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
Ingo Molnar48f24c42006-07-03 00:25:40 -07002817 NULL))
2818 schedstat_inc(sd, alb_pushed);
2819 else
2820 schedstat_inc(sd, alb_failed);
2821 }
Nick Piggin39507452005-06-25 14:57:09 -07002822 spin_unlock(&target_rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002823}
2824
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002825#ifdef CONFIG_NO_HZ
2826static struct {
2827 atomic_t load_balancer;
2828 cpumask_t cpu_mask;
2829} nohz ____cacheline_aligned = {
2830 .load_balancer = ATOMIC_INIT(-1),
2831 .cpu_mask = CPU_MASK_NONE,
2832};
2833
Christoph Lameter7835b982006-12-10 02:20:22 -08002834/*
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002835 * This routine tries to nominate an ilb (idle load balancing) owner
2836 * among the cpus whose ticks are stopped. The ilb owner does the idle
2837 * load balancing on behalf of all those cpus. If all the cpus in the system
2838 * go into this tickless mode, then there is no ilb owner (as there is
2839 * no need for one) and all the cpus sleep until the next wakeup event
2840 * arrives.
Christoph Lameter7835b982006-12-10 02:20:22 -08002841 *
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002842 * The ilb owner's tick is not stopped; that tick is used
2843 * for idle load balancing. The ilb owner still remains part of
2844 * nohz.cpu_mask.
2845 *
2846 * While stopping the tick, this cpu becomes the ilb owner if there
2847 * is no other owner, and it stays the owner until it becomes busy or
2848 * until all cpus in the system stop their ticks, at which point
2849 * there is no need for an ilb owner.
2850 *
2851 * When the ilb owner becomes busy, it nominates another owner during the
2852 * next busy scheduler_tick().
2853 */
2854int select_nohz_load_balancer(int stop_tick)
2855{
2856 int cpu = smp_processor_id();
2857
2858 if (stop_tick) {
2859 cpu_set(cpu, nohz.cpu_mask);
2860 cpu_rq(cpu)->in_nohz_recently = 1;
2861
2862 /*
2863 * If we are going offline and still the leader, give up!
2864 */
2865 if (cpu_is_offline(cpu) &&
2866 atomic_read(&nohz.load_balancer) == cpu) {
2867 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2868 BUG();
2869 return 0;
2870 }
2871
2872 /* time for ilb owner also to sleep */
2873 if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
2874 if (atomic_read(&nohz.load_balancer) == cpu)
2875 atomic_set(&nohz.load_balancer, -1);
2876 return 0;
2877 }
2878
2879 if (atomic_read(&nohz.load_balancer) == -1) {
2880 /* make me the ilb owner */
2881 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
2882 return 1;
2883 } else if (atomic_read(&nohz.load_balancer) == cpu)
2884 return 1;
2885 } else {
2886 if (!cpu_isset(cpu, nohz.cpu_mask))
2887 return 0;
2888
2889 cpu_clear(cpu, nohz.cpu_mask);
2890
2891 if (atomic_read(&nohz.load_balancer) == cpu)
2892 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
2893 BUG();
2894 }
2895 return 0;
2896}
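/*
 * Illustrative sketch, not part of this file: how a CPU entering or
 * leaving tickless idle is expected to use select_nohz_load_balancer().
 * The real call sites live in the NO_HZ tick code; the helpers below
 * are hypothetical and only demonstrate the return-value contract.
 */
static void example_enter_tickless_idle(void)
{
	if (select_nohz_load_balancer(1)) {
		/* We are now the ilb owner: keep the tick running. */
		return;
	}
	/* Not the owner: the tick may be stopped for this cpu. */
}

static void example_exit_tickless_idle(void)
{
	/* Busy again: leave nohz.cpu_mask (and give up ownership, if held). */
	select_nohz_load_balancer(0);
}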
2897#endif
2898
2899static DEFINE_SPINLOCK(balancing);
2900
2901/*
Christoph Lameter7835b982006-12-10 02:20:22 -08002902 * It checks each scheduling domain to see if it is due to be balanced,
2903 * and initiates a balancing operation if so.
2904 *
2905 * Balancing parameters are set up in arch_init_sched_domains.
2906 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002907static inline void rebalance_domains(int cpu, enum cpu_idle_type idle)
Christoph Lameter7835b982006-12-10 02:20:22 -08002908{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002909 int balance = 1;
2910 struct rq *rq = cpu_rq(cpu);
Christoph Lameter7835b982006-12-10 02:20:22 -08002911 unsigned long interval;
2912 struct sched_domain *sd;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002913 /* Earliest time when we have to do rebalance again */
Christoph Lameterc9819f42006-12-10 02:20:25 -08002914 unsigned long next_balance = jiffies + 60*HZ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002915
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002916 for_each_domain(cpu, sd) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002917 if (!(sd->flags & SD_LOAD_BALANCE))
2918 continue;
2919
2920 interval = sd->balance_interval;
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002921 if (idle != CPU_IDLE)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002922 interval *= sd->busy_factor;
2923
2924 /* scale ms to jiffies */
2925 interval = msecs_to_jiffies(interval);
2926 if (unlikely(!interval))
2927 interval = 1;
Ingo Molnardd41f592007-07-09 18:51:59 +02002928 if (interval > HZ*NR_CPUS/10)
2929 interval = HZ*NR_CPUS/10;
2930
Linus Torvalds1da177e2005-04-16 15:20:36 -07002931
Christoph Lameter08c183f2006-12-10 02:20:29 -08002932 if (sd->flags & SD_SERIALIZE) {
2933 if (!spin_trylock(&balancing))
2934 goto out;
2935 }
2936
Christoph Lameterc9819f42006-12-10 02:20:25 -08002937 if (time_after_eq(jiffies, sd->last_balance + interval)) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002938 if (load_balance(cpu, rq, sd, idle, &balance)) {
Siddha, Suresh Bfa3b6dd2005-09-10 00:26:21 -07002939 /*
2940 * We've pulled tasks over so either we're no
Nick Piggin5969fe02005-09-10 00:26:19 -07002941 * longer idle, or one of our SMT siblings is
2942 * not idle.
2943 */
Ingo Molnard15bcfd2007-07-09 18:51:57 +02002944 idle = CPU_NOT_IDLE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002945 }
Christoph Lameter1bd77f22006-12-10 02:20:27 -08002946 sd->last_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002947 }
Christoph Lameter08c183f2006-12-10 02:20:29 -08002948 if (sd->flags & SD_SERIALIZE)
2949 spin_unlock(&balancing);
2950out:
Christoph Lameterc9819f42006-12-10 02:20:25 -08002951 if (time_after(next_balance, sd->last_balance + interval))
2952 next_balance = sd->last_balance + interval;
Siddha, Suresh B783609c2006-12-10 02:20:33 -08002953
2954 /*
2955 * Stop the load balance at this level. There is another
2956 * CPU in our sched group which is doing load balancing more
2957 * actively.
2958 */
2959 if (!balance)
2960 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002961 }
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002962 rq->next_balance = next_balance;
2963}
2964
2965/*
2966 * run_rebalance_domains is triggered when needed from the scheduler tick.
2967 * In CONFIG_NO_HZ case, the idle load balance owner will do the
2968 * rebalancing for all the cpus for whom scheduler ticks are stopped.
2969 */
2970static void run_rebalance_domains(struct softirq_action *h)
2971{
Ingo Molnardd41f592007-07-09 18:51:59 +02002972 int this_cpu = smp_processor_id();
2973 struct rq *this_rq = cpu_rq(this_cpu);
2974 enum cpu_idle_type idle = this_rq->idle_at_tick ?
2975 CPU_IDLE : CPU_NOT_IDLE;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002976
Ingo Molnardd41f592007-07-09 18:51:59 +02002977 rebalance_domains(this_cpu, idle);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002978
2979#ifdef CONFIG_NO_HZ
2980 /*
2981 * If this cpu is the owner for idle load balancing, then do the
2982 * balancing on behalf of the other idle cpus whose ticks are
2983 * stopped.
2984 */
Ingo Molnardd41f592007-07-09 18:51:59 +02002985 if (this_rq->idle_at_tick &&
2986 atomic_read(&nohz.load_balancer) == this_cpu) {
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002987 cpumask_t cpus = nohz.cpu_mask;
2988 struct rq *rq;
2989 int balance_cpu;
2990
Ingo Molnardd41f592007-07-09 18:51:59 +02002991 cpu_clear(this_cpu, cpus);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07002992 for_each_cpu_mask(balance_cpu, cpus) {
2993 /*
2994 * If this cpu gets work to do, stop the load balancing
2995 * work being done for other cpus. Next load
2996 * balancing owner will pick it up.
2997 */
2998 if (need_resched())
2999 break;
3000
Ingo Molnardd41f592007-07-09 18:51:59 +02003001 rebalance_domains(balance_cpu, CPU_IDLE);
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003002
3003 rq = cpu_rq(balance_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003004 if (time_after(this_rq->next_balance, rq->next_balance))
3005 this_rq->next_balance = rq->next_balance;
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003006 }
3007 }
3008#endif
3009}
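/*
 * Wiring note (sketch only; the registration is assumed to happen in the
 * scheduler init code rather than here): run_rebalance_domains() runs as
 * the SCHED_SOFTIRQ handler, raised by trigger_load_balance() below,
 * presumably via something like:
 *
 *	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
 */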
3010
3011/*
3012 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3013 *
3014 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3015 * idle load balancing owner or decide to stop the periodic load balancing,
3016 * if the whole system is idle.
3017 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003018static inline void trigger_load_balance(struct rq *rq, int cpu)
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003019{
Siddha, Suresh B46cb4b72007-05-08 00:32:51 -07003020#ifdef CONFIG_NO_HZ
3021 /*
3022 * If we were in the nohz mode recently and busy at the current
3023 * scheduler tick, then check if we need to nominate new idle
3024 * load balancer.
3025 */
3026 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3027 rq->in_nohz_recently = 0;
3028
3029 if (atomic_read(&nohz.load_balancer) == cpu) {
3030 cpu_clear(cpu, nohz.cpu_mask);
3031 atomic_set(&nohz.load_balancer, -1);
3032 }
3033
3034 if (atomic_read(&nohz.load_balancer) == -1) {
3035 /*
3036 * simple selection for now: Nominate the
3037 * first cpu in the nohz list to be the next
3038 * ilb owner.
3039 *
3040 * TBD: Traverse the sched domains and nominate
3041 * the nearest cpu in the nohz.cpu_mask.
3042 */
3043 int ilb = first_cpu(nohz.cpu_mask);
3044
3045 if (ilb != NR_CPUS)
3046 resched_cpu(ilb);
3047 }
3048 }
3049
3050 /*
3051 * If this cpu is idle and doing idle load balancing for all the
3052 * cpus with ticks stopped, is it time for that to stop?
3053 */
3054 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3055 cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
3056 resched_cpu(cpu);
3057 return;
3058 }
3059
3060 /*
3061 * If this cpu is idle and the idle load balancing is done by
3062	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
3063 */
3064 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3065 cpu_isset(cpu, nohz.cpu_mask))
3066 return;
3067#endif
3068 if (time_after_eq(jiffies, rq->next_balance))
3069 raise_softirq(SCHED_SOFTIRQ);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003070}
Ingo Molnardd41f592007-07-09 18:51:59 +02003071
3072#else /* CONFIG_SMP */
3073
Linus Torvalds1da177e2005-04-16 15:20:36 -07003074/*
3075 * on UP we do not need to balance between CPUs:
3076 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07003077static inline void idle_balance(int cpu, struct rq *rq)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003078{
3079}
Ingo Molnardd41f592007-07-09 18:51:59 +02003080
3081/* Avoid "used but not defined" warning on UP */
3082static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3083 unsigned long max_nr_move, unsigned long max_load_move,
3084 struct sched_domain *sd, enum cpu_idle_type idle,
3085 int *all_pinned, unsigned long *load_moved,
3086 int this_best_prio, int best_prio, int best_prio_seen,
3087 struct rq_iterator *iterator)
3088{
3089 *load_moved = 0;
3090
3091 return 0;
3092}
3093
Linus Torvalds1da177e2005-04-16 15:20:36 -07003094#endif
3095
Linus Torvalds1da177e2005-04-16 15:20:36 -07003096DEFINE_PER_CPU(struct kernel_stat, kstat);
3097
3098EXPORT_PER_CPU_SYMBOL(kstat);
3099
3100/*
Ingo Molnar41b86e92007-07-09 18:51:58 +02003101 * Return p->sum_exec_runtime plus any more ns on the sched_clock
3102 * that have not yet been banked in case the task is currently running.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003103 */
Ingo Molnar41b86e92007-07-09 18:51:58 +02003104unsigned long long task_sched_runtime(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003105{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003106 unsigned long flags;
Ingo Molnar41b86e92007-07-09 18:51:58 +02003107 u64 ns, delta_exec;
3108 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003109
Ingo Molnar41b86e92007-07-09 18:51:58 +02003110 rq = task_rq_lock(p, &flags);
3111 ns = p->se.sum_exec_runtime;
3112 if (rq->curr == p) {
3113 delta_exec = rq_clock(rq) - p->se.exec_start;
3114 if ((s64)delta_exec > 0)
3115 ns += delta_exec;
3116 }
3117 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07003118
Linus Torvalds1da177e2005-04-16 15:20:36 -07003119 return ns;
3120}
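/*
 * Usage sketch (hypothetical helper, not part of this file): because
 * task_sched_runtime() includes the not-yet-banked nanoseconds of the
 * current run, two samples can bracket an interval accurately.
 */
static u64 example_cpu_consumed_ns(struct task_struct *p, u64 start_sample)
{
	/* start_sample was an earlier task_sched_runtime(p) reading */
	return task_sched_runtime(p) - start_sample;
}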
3121
3122/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003123 * Account user cpu time to a process.
3124 * @p: the process that the cpu time gets accounted to
3126 * @cputime: the cpu time spent in user space since the last update
3127 */
3128void account_user_time(struct task_struct *p, cputime_t cputime)
3129{
3130 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3131 cputime64_t tmp;
3132
3133 p->utime = cputime_add(p->utime, cputime);
3134
3135 /* Add user time to cpustat. */
3136 tmp = cputime_to_cputime64(cputime);
3137 if (TASK_NICE(p) > 0)
3138 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3139 else
3140 cpustat->user = cputime64_add(cpustat->user, tmp);
3141}
3142
3143/*
3144 * Account system cpu time to a process.
3145 * @p: the process that the cpu time gets accounted to
3146 * @hardirq_offset: the offset to subtract from hardirq_count()
3147 * @cputime: the cpu time spent in kernel space since the last update
3148 */
3149void account_system_time(struct task_struct *p, int hardirq_offset,
3150 cputime_t cputime)
3151{
3152 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003153 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003154 cputime64_t tmp;
3155
3156 p->stime = cputime_add(p->stime, cputime);
3157
3158 /* Add system time to cpustat. */
3159 tmp = cputime_to_cputime64(cputime);
3160 if (hardirq_count() - hardirq_offset)
3161 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3162 else if (softirq_count())
3163 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3164 else if (p != rq->idle)
3165 cpustat->system = cputime64_add(cpustat->system, tmp);
3166 else if (atomic_read(&rq->nr_iowait) > 0)
3167 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3168 else
3169 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3170 /* Account for system time used */
3171 acct_update_integrals(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003172}
3173
3174/*
3175 * Account for involuntary wait time.
3176 * @p: the process from which the cpu time has been stolen
3177 * @steal: the cpu time spent in involuntary wait
3178 */
3179void account_steal_time(struct task_struct *p, cputime_t steal)
3180{
3181 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3182 cputime64_t tmp = cputime_to_cputime64(steal);
Ingo Molnar70b97a72006-07-03 00:25:42 -07003183 struct rq *rq = this_rq();
Linus Torvalds1da177e2005-04-16 15:20:36 -07003184
3185 if (p == rq->idle) {
3186 p->stime = cputime_add(p->stime, steal);
3187 if (atomic_read(&rq->nr_iowait) > 0)
3188 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3189 else
3190 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3191 } else
3192 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3193}
3194
Christoph Lameter7835b982006-12-10 02:20:22 -08003195/*
3196 * This function gets called by the timer code, with HZ frequency.
3197 * We call it with interrupts disabled.
3198 *
3199 * It also gets called by the fork code, when changing the parent's
3200 * timeslices.
3201 */
3202void scheduler_tick(void)
3203{
Christoph Lameter7835b982006-12-10 02:20:22 -08003204 int cpu = smp_processor_id();
3205 struct rq *rq = cpu_rq(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02003206 struct task_struct *curr = rq->curr;
Christoph Lameter7835b982006-12-10 02:20:22 -08003207
Ingo Molnardd41f592007-07-09 18:51:59 +02003208 spin_lock(&rq->lock);
3209 if (curr != rq->idle) /* FIXME: needed? */
3210 curr->sched_class->task_tick(rq, curr);
3211 update_cpu_load(rq);
3212 spin_unlock(&rq->lock);
3213
Christoph Lametere418e1c2006-12-10 02:20:23 -08003214#ifdef CONFIG_SMP
Ingo Molnardd41f592007-07-09 18:51:59 +02003215 rq->idle_at_tick = idle_cpu(cpu);
3216 trigger_load_balance(rq, cpu);
Christoph Lametere418e1c2006-12-10 02:20:23 -08003217#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07003218}
3219
Linus Torvalds1da177e2005-04-16 15:20:36 -07003220#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
3221
3222void fastcall add_preempt_count(int val)
3223{
3224 /*
3225 * Underflow?
3226 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003227 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3228 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003229 preempt_count() += val;
3230 /*
3231 * Spinlock count overflowing soon?
3232 */
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08003233 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3234 PREEMPT_MASK - 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003235}
3236EXPORT_SYMBOL(add_preempt_count);
3237
3238void fastcall sub_preempt_count(int val)
3239{
3240 /*
3241 * Underflow?
3242 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003243 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3244 return;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003245 /*
3246 * Is the spinlock portion underflowing?
3247 */
Ingo Molnar9a11b49a2006-07-03 00:24:33 -07003248 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3249 !(preempt_count() & PREEMPT_MASK)))
3250 return;
3251
Linus Torvalds1da177e2005-04-16 15:20:36 -07003252 preempt_count() -= val;
3253}
3254EXPORT_SYMBOL(sub_preempt_count);
3255
3256#endif
3257
3258/*
Ingo Molnardd41f592007-07-09 18:51:59 +02003259 * Print scheduling while atomic bug:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003260 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003261static noinline void __schedule_bug(struct task_struct *prev)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003262{
Ingo Molnardd41f592007-07-09 18:51:59 +02003263 printk(KERN_ERR "BUG: scheduling while atomic: %s/0x%08x/%d\n",
3264 prev->comm, preempt_count(), prev->pid);
3265 debug_show_held_locks(prev);
3266 if (irqs_disabled())
3267 print_irqtrace_events(prev);
3268 dump_stack();
3269}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003270
Ingo Molnardd41f592007-07-09 18:51:59 +02003271/*
3272 * Various schedule()-time debugging checks and statistics:
3273 */
3274static inline void schedule_debug(struct task_struct *prev)
3275{
Linus Torvalds1da177e2005-04-16 15:20:36 -07003276 /*
3277 * Test if we are atomic. Since do_exit() needs to call into
3278 * schedule() atomically, we ignore that path for now.
3279 * Otherwise, whine if we are scheduling when we should not be.
3280 */
Ingo Molnardd41f592007-07-09 18:51:59 +02003281 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
3282 __schedule_bug(prev);
3283
Linus Torvalds1da177e2005-04-16 15:20:36 -07003284 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3285
Ingo Molnardd41f592007-07-09 18:51:59 +02003286 schedstat_inc(this_rq(), sched_cnt);
3287}
3288
3289/*
3290 * Pick up the highest-prio task:
3291 */
3292static inline struct task_struct *
3293pick_next_task(struct rq *rq, struct task_struct *prev, u64 now)
3294{
3295 struct sched_class *class;
3296 struct task_struct *p;
3297
3298 /*
3299 * Optimization: we know that if all tasks are in
3300 * the fair class we can call that function directly:
3301 */
3302 if (likely(rq->nr_running == rq->cfs.nr_running)) {
3303 p = fair_sched_class.pick_next_task(rq, now);
3304 if (likely(p))
3305 return p;
3306 }
3307
3308 class = sched_class_highest;
3309 for ( ; ; ) {
3310 p = class->pick_next_task(rq, now);
3311 if (p)
3312 return p;
3313 /*
3314 * Will never be NULL as the idle class always
3315 * returns a non-NULL p:
3316 */
3317 class = class->next;
3318 }
3319}
3320
3321/*
3322 * schedule() is the main scheduler function.
3323 */
3324asmlinkage void __sched schedule(void)
3325{
3326 struct task_struct *prev, *next;
3327 long *switch_count;
3328 struct rq *rq;
3329 u64 now;
3330 int cpu;
3331
Linus Torvalds1da177e2005-04-16 15:20:36 -07003332need_resched:
3333 preempt_disable();
Ingo Molnardd41f592007-07-09 18:51:59 +02003334 cpu = smp_processor_id();
3335 rq = cpu_rq(cpu);
3336 rcu_qsctr_inc(cpu);
3337 prev = rq->curr;
3338 switch_count = &prev->nivcsw;
3339
Linus Torvalds1da177e2005-04-16 15:20:36 -07003340 release_kernel_lock(prev);
3341need_resched_nonpreemptible:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003342
Ingo Molnardd41f592007-07-09 18:51:59 +02003343 schedule_debug(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003344
3345 spin_lock_irq(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003346 clear_tsk_need_resched(prev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003347
Ingo Molnardd41f592007-07-09 18:51:59 +02003348 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3349 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
3350 unlikely(signal_pending(prev)))) {
3351 prev->state = TASK_RUNNING;
3352 } else {
3353 deactivate_task(rq, prev, 1);
3354 }
3355 switch_count = &prev->nvcsw;
3356 }
3357
3358 if (unlikely(!rq->nr_running))
3359 idle_balance(cpu, rq);
3360
3361 now = __rq_clock(rq);
3362 prev->sched_class->put_prev_task(rq, prev, now);
3363 next = pick_next_task(rq, prev, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003364
3365 sched_info_switch(prev, next);
Ingo Molnardd41f592007-07-09 18:51:59 +02003366
Linus Torvalds1da177e2005-04-16 15:20:36 -07003367 if (likely(prev != next)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003368 rq->nr_switches++;
3369 rq->curr = next;
3370 ++*switch_count;
3371
Ingo Molnardd41f592007-07-09 18:51:59 +02003372 context_switch(rq, prev, next); /* unlocks the rq */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003373 } else
3374 spin_unlock_irq(&rq->lock);
3375
Ingo Molnardd41f592007-07-09 18:51:59 +02003376 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3377 cpu = smp_processor_id();
3378 rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003379 goto need_resched_nonpreemptible;
Ingo Molnardd41f592007-07-09 18:51:59 +02003380 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003381 preempt_enable_no_resched();
3382 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3383 goto need_resched;
3384}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003385EXPORT_SYMBOL(schedule);
3386
3387#ifdef CONFIG_PREEMPT
3388/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003389 * this is the entry point to schedule() from in-kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003390 * off of preempt_enable. Kernel preemptions off return from interrupt
3391 * occur there and call schedule directly.
3392 */
3393asmlinkage void __sched preempt_schedule(void)
3394{
3395 struct thread_info *ti = current_thread_info();
3396#ifdef CONFIG_PREEMPT_BKL
3397 struct task_struct *task = current;
3398 int saved_lock_depth;
3399#endif
3400 /*
3401 * If there is a non-zero preempt_count or interrupts are disabled,
3402 * we do not want to preempt the current task. Just return..
3403 */
Nick Pigginbeed33a2006-10-11 01:21:52 -07003404 if (likely(ti->preempt_count || irqs_disabled()))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003405 return;
3406
3407need_resched:
3408 add_preempt_count(PREEMPT_ACTIVE);
3409 /*
3410 * We keep the big kernel semaphore locked, but we
3411	 * clear ->lock_depth so that schedule() doesn't
3412 * auto-release the semaphore:
3413 */
3414#ifdef CONFIG_PREEMPT_BKL
3415 saved_lock_depth = task->lock_depth;
3416 task->lock_depth = -1;
3417#endif
3418 schedule();
3419#ifdef CONFIG_PREEMPT_BKL
3420 task->lock_depth = saved_lock_depth;
3421#endif
3422 sub_preempt_count(PREEMPT_ACTIVE);
3423
3424 /* we could miss a preemption opportunity between schedule and now */
3425 barrier();
3426 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3427 goto need_resched;
3428}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003429EXPORT_SYMBOL(preempt_schedule);
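/*
 * Context sketch (hypothetical function, not part of this file):
 * preempt_schedule() is reached from preempt_enable() when a reschedule
 * became pending while preemption was disabled, which is why it
 * re-checks TIF_NEED_RESCHED in a loop above.
 */
static void example_preempt_section(void)
{
	preempt_disable();
	/* ... touch per-cpu state; TIF_NEED_RESCHED may be set meanwhile ... */
	preempt_enable();	/* may enter preempt_schedule() right here */
}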
3430
3431/*
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003432 * this is the entry point to schedule() from kernel preemption
Linus Torvalds1da177e2005-04-16 15:20:36 -07003433 * off of irq context.
3434	 * Note that this is called and returns with irqs disabled. This
3435	 * protects us against recursive calls from irq context.
3436 */
3437asmlinkage void __sched preempt_schedule_irq(void)
3438{
3439 struct thread_info *ti = current_thread_info();
3440#ifdef CONFIG_PREEMPT_BKL
3441 struct task_struct *task = current;
3442 int saved_lock_depth;
3443#endif
Andreas Mohr2ed6e342006-07-10 04:43:52 -07003444 /* Catch callers which need to be fixed */
Linus Torvalds1da177e2005-04-16 15:20:36 -07003445 BUG_ON(ti->preempt_count || !irqs_disabled());
3446
3447need_resched:
3448 add_preempt_count(PREEMPT_ACTIVE);
3449 /*
3450 * We keep the big kernel semaphore locked, but we
3451	 * clear ->lock_depth so that schedule() doesn't
3452 * auto-release the semaphore:
3453 */
3454#ifdef CONFIG_PREEMPT_BKL
3455 saved_lock_depth = task->lock_depth;
3456 task->lock_depth = -1;
3457#endif
3458 local_irq_enable();
3459 schedule();
3460 local_irq_disable();
3461#ifdef CONFIG_PREEMPT_BKL
3462 task->lock_depth = saved_lock_depth;
3463#endif
3464 sub_preempt_count(PREEMPT_ACTIVE);
3465
3466 /* we could miss a preemption opportunity between schedule and now */
3467 barrier();
3468 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3469 goto need_resched;
3470}
3471
3472#endif /* CONFIG_PREEMPT */
3473
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003474int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3475 void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003476{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003477 return try_to_wake_up(curr->private, mode, sync);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003478}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003479EXPORT_SYMBOL(default_wake_function);
3480
3481/*
3482 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3483 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3484 * number) then we wake all the non-exclusive tasks and one exclusive task.
3485 *
3486 * There are circumstances in which we can try to wake a task which has already
3487 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3488 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3489 */
3490static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3491 int nr_exclusive, int sync, void *key)
3492{
3493 struct list_head *tmp, *next;
3494
3495 list_for_each_safe(tmp, next, &q->task_list) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07003496 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3497 unsigned flags = curr->flags;
3498
Linus Torvalds1da177e2005-04-16 15:20:36 -07003499 if (curr->func(curr, mode, sync, key) &&
Ingo Molnar48f24c42006-07-03 00:25:40 -07003500 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003501 break;
3502 }
3503}
3504
3505/**
3506 * __wake_up - wake up threads blocked on a waitqueue.
3507 * @q: the waitqueue
3508 * @mode: which threads
3509 * @nr_exclusive: how many wake-one or wake-many threads to wake up
Martin Waitz67be2dd2005-05-01 08:59:26 -07003510 * @key: is directly passed to the wakeup function
Linus Torvalds1da177e2005-04-16 15:20:36 -07003511 */
3512void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003513 int nr_exclusive, void *key)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003514{
3515 unsigned long flags;
3516
3517 spin_lock_irqsave(&q->lock, flags);
3518 __wake_up_common(q, mode, nr_exclusive, 0, key);
3519 spin_unlock_irqrestore(&q->lock, flags);
3520}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003521EXPORT_SYMBOL(__wake_up);
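/*
 * Usage sketch (hypothetical module-level code, not part of this file):
 * the usual pairing of wait_event_interruptible() with wake_up(), the
 * latter ending up in __wake_up() above.
 */
static DECLARE_WAIT_QUEUE_HEAD(example_waitq);
static int example_ready;

static int example_consumer(void)
{
	/* Sleeps until example_ready is non-zero; -ERESTARTSYS on a signal. */
	if (wait_event_interruptible(example_waitq, example_ready))
		return -ERESTARTSYS;
	return 0;
}

static void example_producer(void)
{
	example_ready = 1;
	/* wakes one exclusive waiter plus all non-exclusive waiters */
	wake_up(&example_waitq);
}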
3522
3523/*
3524 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
3525 */
3526void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3527{
3528 __wake_up_common(q, mode, 1, 0, NULL);
3529}
3530
3531/**
Martin Waitz67be2dd2005-05-01 08:59:26 -07003532 * __wake_up_sync - wake up threads blocked on a waitqueue.
Linus Torvalds1da177e2005-04-16 15:20:36 -07003533 * @q: the waitqueue
3534 * @mode: which threads
3535 * @nr_exclusive: how many wake-one or wake-many threads to wake up
3536 *
3537	 * The sync wakeup differs in that the waker knows that it will schedule
3538	 * away soon, so while the target thread will be woken up, it will not
3539	 * be migrated to another CPU - i.e. the two threads are 'synchronized'
3540 * with each other. This can prevent needless bouncing between CPUs.
3541 *
3542 * On UP it can prevent extra preemption.
3543 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003544void fastcall
3545__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003546{
3547 unsigned long flags;
3548 int sync = 1;
3549
3550 if (unlikely(!q))
3551 return;
3552
3553 if (unlikely(!nr_exclusive))
3554 sync = 0;
3555
3556 spin_lock_irqsave(&q->lock, flags);
3557 __wake_up_common(q, mode, nr_exclusive, sync, NULL);
3558 spin_unlock_irqrestore(&q->lock, flags);
3559}
3560EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
3561
3562void fastcall complete(struct completion *x)
3563{
3564 unsigned long flags;
3565
3566 spin_lock_irqsave(&x->wait.lock, flags);
3567 x->done++;
3568 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3569 1, 0, NULL);
3570 spin_unlock_irqrestore(&x->wait.lock, flags);
3571}
3572EXPORT_SYMBOL(complete);
3573
3574void fastcall complete_all(struct completion *x)
3575{
3576 unsigned long flags;
3577
3578 spin_lock_irqsave(&x->wait.lock, flags);
3579 x->done += UINT_MAX/2;
3580 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3581 0, 0, NULL);
3582 spin_unlock_irqrestore(&x->wait.lock, flags);
3583}
3584EXPORT_SYMBOL(complete_all);
3585
3586void fastcall __sched wait_for_completion(struct completion *x)
3587{
3588 might_sleep();
Ingo Molnar48f24c42006-07-03 00:25:40 -07003589
Linus Torvalds1da177e2005-04-16 15:20:36 -07003590 spin_lock_irq(&x->wait.lock);
3591 if (!x->done) {
3592 DECLARE_WAITQUEUE(wait, current);
3593
3594 wait.flags |= WQ_FLAG_EXCLUSIVE;
3595 __add_wait_queue_tail(&x->wait, &wait);
3596 do {
3597 __set_current_state(TASK_UNINTERRUPTIBLE);
3598 spin_unlock_irq(&x->wait.lock);
3599 schedule();
3600 spin_lock_irq(&x->wait.lock);
3601 } while (!x->done);
3602 __remove_wait_queue(&x->wait, &wait);
3603 }
3604 x->done--;
3605 spin_unlock_irq(&x->wait.lock);
3606}
3607EXPORT_SYMBOL(wait_for_completion);
3608
3609unsigned long fastcall __sched
3610wait_for_completion_timeout(struct completion *x, unsigned long timeout)
3611{
3612 might_sleep();
3613
3614 spin_lock_irq(&x->wait.lock);
3615 if (!x->done) {
3616 DECLARE_WAITQUEUE(wait, current);
3617
3618 wait.flags |= WQ_FLAG_EXCLUSIVE;
3619 __add_wait_queue_tail(&x->wait, &wait);
3620 do {
3621 __set_current_state(TASK_UNINTERRUPTIBLE);
3622 spin_unlock_irq(&x->wait.lock);
3623 timeout = schedule_timeout(timeout);
3624 spin_lock_irq(&x->wait.lock);
3625 if (!timeout) {
3626 __remove_wait_queue(&x->wait, &wait);
3627 goto out;
3628 }
3629 } while (!x->done);
3630 __remove_wait_queue(&x->wait, &wait);
3631 }
3632 x->done--;
3633out:
3634 spin_unlock_irq(&x->wait.lock);
3635 return timeout;
3636}
3637EXPORT_SYMBOL(wait_for_completion_timeout);
3638
3639int fastcall __sched wait_for_completion_interruptible(struct completion *x)
3640{
3641 int ret = 0;
3642
3643 might_sleep();
3644
3645 spin_lock_irq(&x->wait.lock);
3646 if (!x->done) {
3647 DECLARE_WAITQUEUE(wait, current);
3648
3649 wait.flags |= WQ_FLAG_EXCLUSIVE;
3650 __add_wait_queue_tail(&x->wait, &wait);
3651 do {
3652 if (signal_pending(current)) {
3653 ret = -ERESTARTSYS;
3654 __remove_wait_queue(&x->wait, &wait);
3655 goto out;
3656 }
3657 __set_current_state(TASK_INTERRUPTIBLE);
3658 spin_unlock_irq(&x->wait.lock);
3659 schedule();
3660 spin_lock_irq(&x->wait.lock);
3661 } while (!x->done);
3662 __remove_wait_queue(&x->wait, &wait);
3663 }
3664 x->done--;
3665out:
3666 spin_unlock_irq(&x->wait.lock);
3667
3668 return ret;
3669}
3670EXPORT_SYMBOL(wait_for_completion_interruptible);
3671
3672unsigned long fastcall __sched
3673wait_for_completion_interruptible_timeout(struct completion *x,
3674 unsigned long timeout)
3675{
3676 might_sleep();
3677
3678 spin_lock_irq(&x->wait.lock);
3679 if (!x->done) {
3680 DECLARE_WAITQUEUE(wait, current);
3681
3682 wait.flags |= WQ_FLAG_EXCLUSIVE;
3683 __add_wait_queue_tail(&x->wait, &wait);
3684 do {
3685 if (signal_pending(current)) {
3686 timeout = -ERESTARTSYS;
3687 __remove_wait_queue(&x->wait, &wait);
3688 goto out;
3689 }
3690 __set_current_state(TASK_INTERRUPTIBLE);
3691 spin_unlock_irq(&x->wait.lock);
3692 timeout = schedule_timeout(timeout);
3693 spin_lock_irq(&x->wait.lock);
3694 if (!timeout) {
3695 __remove_wait_queue(&x->wait, &wait);
3696 goto out;
3697 }
3698 } while (!x->done);
3699 __remove_wait_queue(&x->wait, &wait);
3700 }
3701 x->done--;
3702out:
3703 spin_unlock_irq(&x->wait.lock);
3704 return timeout;
3705}
3706EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
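/*
 * Usage sketch (hypothetical module-level code, not part of this file):
 * a completion lets one context wait for an event signalled by another,
 * built on the wait queue primitives above.
 */
static DECLARE_COMPLETION(example_done);

static int example_async_worker(void *unused)
{
	/* ... perform the deferred work ... */
	complete(&example_done);	/* release exactly one waiter */
	return 0;
}

static int example_waiter(void)
{
	/* wait_for_completion_timeout() returns 0 if the timeout expired. */
	if (!wait_for_completion_timeout(&example_done, msecs_to_jiffies(500)))
		return -ETIMEDOUT;
	return 0;
}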
3707
Ingo Molnar0fec1712007-07-09 18:52:01 +02003708static inline void
3709sleep_on_head(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003710{
Ingo Molnar0fec1712007-07-09 18:52:01 +02003711 spin_lock_irqsave(&q->lock, *flags);
3712 __add_wait_queue(q, wait);
3713 spin_unlock(&q->lock);
3714}
3715
3716static inline void
3717sleep_on_tail(wait_queue_head_t *q, wait_queue_t *wait, unsigned long *flags)
3718{
3719 spin_lock_irq(&q->lock);
3720 __remove_wait_queue(q, wait);
3721 spin_unlock_irqrestore(&q->lock, *flags);
3722}
3723
3724void __sched interruptible_sleep_on(wait_queue_head_t *q)
3725{
3726 unsigned long flags;
3727 wait_queue_t wait;
3728
3729 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003730
3731 current->state = TASK_INTERRUPTIBLE;
3732
Ingo Molnar0fec1712007-07-09 18:52:01 +02003733 sleep_on_head(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003734 schedule();
Ingo Molnar0fec1712007-07-09 18:52:01 +02003735 sleep_on_tail(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003736}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003737EXPORT_SYMBOL(interruptible_sleep_on);
3738
Ingo Molnar0fec1712007-07-09 18:52:01 +02003739long __sched
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07003740interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003741{
Ingo Molnar0fec1712007-07-09 18:52:01 +02003742 unsigned long flags;
3743 wait_queue_t wait;
3744
3745 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003746
3747 current->state = TASK_INTERRUPTIBLE;
3748
Ingo Molnar0fec1712007-07-09 18:52:01 +02003749 sleep_on_head(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003750 timeout = schedule_timeout(timeout);
Ingo Molnar0fec1712007-07-09 18:52:01 +02003751 sleep_on_tail(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003752
3753 return timeout;
3754}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003755EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3756
Ingo Molnar0fec1712007-07-09 18:52:01 +02003757void __sched sleep_on(wait_queue_head_t *q)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003758{
Ingo Molnar0fec1712007-07-09 18:52:01 +02003759 unsigned long flags;
3760 wait_queue_t wait;
3761
3762 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003763
3764 current->state = TASK_UNINTERRUPTIBLE;
3765
Ingo Molnar0fec1712007-07-09 18:52:01 +02003766 sleep_on_head(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003767 schedule();
Ingo Molnar0fec1712007-07-09 18:52:01 +02003768 sleep_on_tail(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003769}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003770EXPORT_SYMBOL(sleep_on);
3771
Ingo Molnar0fec1712007-07-09 18:52:01 +02003772long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003773{
Ingo Molnar0fec1712007-07-09 18:52:01 +02003774 unsigned long flags;
3775 wait_queue_t wait;
3776
3777 init_waitqueue_entry(&wait, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003778
3779 current->state = TASK_UNINTERRUPTIBLE;
3780
Ingo Molnar0fec1712007-07-09 18:52:01 +02003781 sleep_on_head(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003782 timeout = schedule_timeout(timeout);
Ingo Molnar0fec1712007-07-09 18:52:01 +02003783 sleep_on_tail(q, &wait, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003784
3785 return timeout;
3786}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003787EXPORT_SYMBOL(sleep_on_timeout);
3788
Ingo Molnarb29739f2006-06-27 02:54:51 -07003789#ifdef CONFIG_RT_MUTEXES
3790
3791/*
3792 * rt_mutex_setprio - set the current priority of a task
3793 * @p: task
3794 * @prio: prio value (kernel-internal form)
3795 *
3796 * This function changes the 'effective' priority of a task. It does
3797 * not touch ->normal_prio like __setscheduler().
3798 *
3799 * Used by the rt_mutex code to implement priority inheritance logic.
3800 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003801void rt_mutex_setprio(struct task_struct *p, int prio)
Ingo Molnarb29739f2006-06-27 02:54:51 -07003802{
3803 unsigned long flags;
Ingo Molnardd41f592007-07-09 18:51:59 +02003804 int oldprio, on_rq;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003805 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003806 u64 now;
Ingo Molnarb29739f2006-06-27 02:54:51 -07003807
3808 BUG_ON(prio < 0 || prio > MAX_PRIO);
3809
3810 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003811 now = rq_clock(rq);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003812
Andrew Mortond5f9f942007-05-08 20:27:06 -07003813 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02003814 on_rq = p->se.on_rq;
3815 if (on_rq)
3816 dequeue_task(rq, p, 0, now);
3817
3818 if (rt_prio(prio))
3819 p->sched_class = &rt_sched_class;
3820 else
3821 p->sched_class = &fair_sched_class;
3822
Ingo Molnarb29739f2006-06-27 02:54:51 -07003823 p->prio = prio;
3824
Ingo Molnardd41f592007-07-09 18:51:59 +02003825 if (on_rq) {
3826 enqueue_task(rq, p, 0, now);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003827 /*
3828 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07003829 * our priority decreased, or if we are not currently running on
3830 * this runqueue and our priority is higher than the current's
Ingo Molnarb29739f2006-06-27 02:54:51 -07003831 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003832 if (task_running(rq, p)) {
3833 if (p->prio > oldprio)
3834 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02003835 } else {
3836 check_preempt_curr(rq, p);
3837 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07003838 }
3839 task_rq_unlock(rq, &flags);
3840}
3841
3842#endif
3843
Ingo Molnar36c8b582006-07-03 00:25:41 -07003844void set_user_nice(struct task_struct *p, long nice)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003845{
Ingo Molnardd41f592007-07-09 18:51:59 +02003846 int old_prio, delta, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003847 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07003848 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02003849 u64 now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003850
3851 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3852 return;
3853 /*
3854 * We have to be careful, if called from sys_setpriority(),
3855 * the task might be in the middle of scheduling on another CPU.
3856 */
3857 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02003858 now = rq_clock(rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003859 /*
3860 * The RT priorities are set via sched_setscheduler(), but we still
3861 * allow the 'normal' nice value to be set - but as expected
3862	 * it won't have any effect on scheduling until the task is
Ingo Molnardd41f592007-07-09 18:51:59 +02003863 * SCHED_FIFO/SCHED_RR:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003864 */
Ingo Molnare05606d2007-07-09 18:51:59 +02003865 if (task_has_rt_policy(p)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003866 p->static_prio = NICE_TO_PRIO(nice);
3867 goto out_unlock;
3868 }
Ingo Molnardd41f592007-07-09 18:51:59 +02003869 on_rq = p->se.on_rq;
3870 if (on_rq) {
3871 dequeue_task(rq, p, 0, now);
3872 dec_load(rq, p, now);
Peter Williams2dd73a42006-06-27 02:54:34 -07003873 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003874
Linus Torvalds1da177e2005-04-16 15:20:36 -07003875 p->static_prio = NICE_TO_PRIO(nice);
Peter Williams2dd73a42006-06-27 02:54:34 -07003876 set_load_weight(p);
Ingo Molnarb29739f2006-06-27 02:54:51 -07003877 old_prio = p->prio;
3878 p->prio = effective_prio(p);
3879 delta = p->prio - old_prio;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003880
Ingo Molnardd41f592007-07-09 18:51:59 +02003881 if (on_rq) {
3882 enqueue_task(rq, p, 0, now);
3883 inc_load(rq, p, now);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003884 /*
Andrew Mortond5f9f942007-05-08 20:27:06 -07003885 * If the task increased its priority or is running and
3886 * lowered its priority, then reschedule its CPU:
Linus Torvalds1da177e2005-04-16 15:20:36 -07003887 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07003888 if (delta < 0 || (delta > 0 && task_running(rq, p)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003889 resched_task(rq->curr);
3890 }
3891out_unlock:
3892 task_rq_unlock(rq, &flags);
3893}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003894EXPORT_SYMBOL(set_user_nice);
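/*
 * Usage sketch (hypothetical kthread body; assumes <linux/kthread.h>):
 * background kernel threads commonly demote themselves with
 * set_user_nice() so they only run when nothing else wants the CPU.
 */
static int example_background_thread(void *unused)
{
	set_user_nice(current, 19);	/* lowest-priority nice level */

	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}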
3895
Matt Mackalle43379f2005-05-01 08:59:00 -07003896/*
3897 * can_nice - check if a task can reduce its nice value
3898 * @p: task
3899 * @nice: nice value
3900 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003901int can_nice(const struct task_struct *p, const int nice)
Matt Mackalle43379f2005-05-01 08:59:00 -07003902{
Matt Mackall024f4742005-08-18 11:24:19 -07003903 /* convert nice value [19,-20] to rlimit style value [1,40] */
3904 int nice_rlim = 20 - nice;
Ingo Molnar48f24c42006-07-03 00:25:40 -07003905
Matt Mackalle43379f2005-05-01 08:59:00 -07003906 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3907 capable(CAP_SYS_NICE));
3908}
3909
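/*
 * Worked example for can_nice() above (hypothetical helper and values):
 * with RLIMIT_NICE set to 25, a request for nice -5 maps to
 * nice_rlim = 20 - (-5) = 25 and is allowed, while nice -10 maps to
 * 30 > 25 and is refused unless the task has CAP_SYS_NICE.
 */
static int example_try_renice(struct task_struct *p)
{
	return can_nice(p, -5) ? 0 : -EPERM;
}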
Linus Torvalds1da177e2005-04-16 15:20:36 -07003910#ifdef __ARCH_WANT_SYS_NICE
3911
3912/*
3913 * sys_nice - change the priority of the current process.
3914 * @increment: priority increment
3915 *
3916 * sys_setpriority is a more generic, but much slower function that
3917 * does similar things.
3918 */
3919asmlinkage long sys_nice(int increment)
3920{
Ingo Molnar48f24c42006-07-03 00:25:40 -07003921 long nice, retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003922
3923 /*
3924 * Setpriority might change our priority at the same moment.
3925 * We don't have to worry. Conceptually one call occurs first
3926 * and we have a single winner.
3927 */
Matt Mackalle43379f2005-05-01 08:59:00 -07003928 if (increment < -40)
3929 increment = -40;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003930 if (increment > 40)
3931 increment = 40;
3932
3933 nice = PRIO_TO_NICE(current->static_prio) + increment;
3934 if (nice < -20)
3935 nice = -20;
3936 if (nice > 19)
3937 nice = 19;
3938
Matt Mackalle43379f2005-05-01 08:59:00 -07003939 if (increment < 0 && !can_nice(current, nice))
3940 return -EPERM;
3941
Linus Torvalds1da177e2005-04-16 15:20:36 -07003942 retval = security_task_setnice(current, nice);
3943 if (retval)
3944 return retval;
3945
3946 set_user_nice(current, nice);
3947 return 0;
3948}
3949
3950#endif
3951
3952/**
3953 * task_prio - return the priority value of a given task.
3954 * @p: the task in question.
3955 *
3956 * This is the priority value as seen by users in /proc.
3957 * RT tasks are offset by -200. Normal tasks are centered
3958 * around 0, value goes from -16 to +15.
3959 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003960int task_prio(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003961{
3962 return p->prio - MAX_RT_PRIO;
3963}
3964
3965/**
3966 * task_nice - return the nice value of a given task.
3967 * @p: the task in question.
3968 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003969int task_nice(const struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003970{
3971 return TASK_NICE(p);
3972}
Linus Torvalds1da177e2005-04-16 15:20:36 -07003973EXPORT_SYMBOL_GPL(task_nice);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003974
3975/**
3976 * idle_cpu - is a given cpu idle currently?
3977 * @cpu: the processor in question.
3978 */
3979int idle_cpu(int cpu)
3980{
3981 return cpu_curr(cpu) == cpu_rq(cpu)->idle;
3982}
3983
Linus Torvalds1da177e2005-04-16 15:20:36 -07003984/**
3985 * idle_task - return the idle task for a given cpu.
3986 * @cpu: the processor in question.
3987 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003988struct task_struct *idle_task(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003989{
3990 return cpu_rq(cpu)->idle;
3991}
3992
3993/**
3994 * find_process_by_pid - find a process with a matching PID value.
3995 * @pid: the pid in question.
3996 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07003997static inline struct task_struct *find_process_by_pid(pid_t pid)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003998{
3999 return pid ? find_task_by_pid(pid) : current;
4000}
4001
4002/* Actually do priority change: must hold rq lock. */
Ingo Molnardd41f592007-07-09 18:51:59 +02004003static void
4004__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004005{
Ingo Molnardd41f592007-07-09 18:51:59 +02004006 BUG_ON(p->se.on_rq);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004007
Linus Torvalds1da177e2005-04-16 15:20:36 -07004008 p->policy = policy;
Ingo Molnardd41f592007-07-09 18:51:59 +02004009 switch (p->policy) {
4010 case SCHED_NORMAL:
4011 case SCHED_BATCH:
4012 case SCHED_IDLE:
4013 p->sched_class = &fair_sched_class;
4014 break;
4015 case SCHED_FIFO:
4016 case SCHED_RR:
4017 p->sched_class = &rt_sched_class;
4018 break;
4019 }
4020
Linus Torvalds1da177e2005-04-16 15:20:36 -07004021 p->rt_priority = prio;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004022 p->normal_prio = normal_prio(p);
4023 /* we are holding p->pi_lock already */
4024 p->prio = rt_mutex_getprio(p);
Peter Williams2dd73a42006-06-27 02:54:34 -07004025 set_load_weight(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004026}
4027
4028/**
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004029 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004030 * @p: the task in question.
4031 * @policy: new policy.
4032 * @param: structure containing the new RT priority.
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004033 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004034 * NOTE that the task may already be dead.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004035 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004036int sched_setscheduler(struct task_struct *p, int policy,
4037 struct sched_param *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004038{
Ingo Molnardd41f592007-07-09 18:51:59 +02004039 int retval, oldprio, oldpolicy = -1, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004040 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004041 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004042
Steven Rostedt66e53932006-06-27 02:54:44 -07004043 /* may grab non-irq protected spin_locks */
4044 BUG_ON(in_interrupt());
Linus Torvalds1da177e2005-04-16 15:20:36 -07004045recheck:
4046 /* double check policy once rq lock held */
4047 if (policy < 0)
4048 policy = oldpolicy = p->policy;
4049 else if (policy != SCHED_FIFO && policy != SCHED_RR &&
Ingo Molnardd41f592007-07-09 18:51:59 +02004050 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
4051 policy != SCHED_IDLE)
Ingo Molnarb0a94992006-01-14 13:20:41 -08004052 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004053 /*
4054 * Valid priorities for SCHED_FIFO and SCHED_RR are
Ingo Molnardd41f592007-07-09 18:51:59 +02004055 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4056 * SCHED_BATCH and SCHED_IDLE is 0.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004057 */
4058 if (param->sched_priority < 0 ||
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004059 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
Steven Rostedtd46523e2005-07-25 16:28:39 -04004060 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004061 return -EINVAL;
Ingo Molnare05606d2007-07-09 18:51:59 +02004062 if (rt_policy(policy) != (param->sched_priority != 0))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004063 return -EINVAL;
4064
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004065 /*
4066 * Allow unprivileged RT tasks to decrease priority:
4067 */
4068 if (!capable(CAP_SYS_NICE)) {
Ingo Molnare05606d2007-07-09 18:51:59 +02004069 if (rt_policy(policy)) {
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004070 unsigned long rlim_rtprio;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004071
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004072 if (!lock_task_sighand(p, &flags))
4073 return -ESRCH;
4074 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
4075 unlock_task_sighand(p, &flags);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004076
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004077 /* can't set/change the rt policy */
4078 if (policy != p->policy && !rlim_rtprio)
4079 return -EPERM;
4080
4081 /* can't increase priority */
4082 if (param->sched_priority > p->rt_priority &&
4083 param->sched_priority > rlim_rtprio)
4084 return -EPERM;
4085 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004086 /*
4087	 * Like positive nice levels, don't allow tasks to
4088 * move out of SCHED_IDLE either:
4089 */
4090 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
4091 return -EPERM;
Oleg Nesterov8dc3e902006-09-29 02:00:50 -07004092
Olivier Croquette37e4ab32005-06-25 14:57:32 -07004093 /* can't change other user's priorities */
4094 if ((current->euid != p->euid) &&
4095 (current->euid != p->uid))
4096 return -EPERM;
4097 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004098
4099 retval = security_task_setscheduler(p, policy, param);
4100 if (retval)
4101 return retval;
4102 /*
Ingo Molnarb29739f2006-06-27 02:54:51 -07004103 * make sure no PI-waiters arrive (or leave) while we are
4104 * changing the priority of the task:
4105 */
4106 spin_lock_irqsave(&p->pi_lock, flags);
4107 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -07004108 * To be able to change p->policy safely, the appropriate
4109 * runqueue lock must be held.
4110 */
Ingo Molnarb29739f2006-06-27 02:54:51 -07004111 rq = __task_rq_lock(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004112 /* recheck policy now with rq lock held */
4113 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4114 policy = oldpolicy = -1;
Ingo Molnarb29739f2006-06-27 02:54:51 -07004115 __task_rq_unlock(rq);
4116 spin_unlock_irqrestore(&p->pi_lock, flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004117 goto recheck;
4118 }
Ingo Molnardd41f592007-07-09 18:51:59 +02004119 on_rq = p->se.on_rq;
4120 if (on_rq)
4121 deactivate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004122 oldprio = p->prio;
Ingo Molnardd41f592007-07-09 18:51:59 +02004123 __setscheduler(rq, p, policy, param->sched_priority);
4124 if (on_rq) {
4125 activate_task(rq, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004126 /*
4127 * Reschedule if we are currently running on this runqueue and
Andrew Mortond5f9f942007-05-08 20:27:06 -07004128 * our priority decreased, or if we are not currently running on
4129		 * this runqueue and our priority is higher than the current task's.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004130 */
Andrew Mortond5f9f942007-05-08 20:27:06 -07004131 if (task_running(rq, p)) {
4132 if (p->prio > oldprio)
4133 resched_task(rq->curr);
Ingo Molnardd41f592007-07-09 18:51:59 +02004134 } else {
4135 check_preempt_curr(rq, p);
4136 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004137 }
Ingo Molnarb29739f2006-06-27 02:54:51 -07004138 __task_rq_unlock(rq);
4139 spin_unlock_irqrestore(&p->pi_lock, flags);
4140
Thomas Gleixner95e02ca2006-06-27 02:55:02 -07004141 rt_mutex_adjust_pi(p);
4142
Linus Torvalds1da177e2005-04-16 15:20:36 -07004143 return 0;
4144}
4145EXPORT_SYMBOL_GPL(sched_setscheduler);
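/*
 * Illustrative sketch, not part of the original file: a typical in-kernel
 * user of the exported sched_setscheduler() above, e.g. a driver promoting
 * its kthread to SCHED_FIFO.  The helper name and the priority value 50 are
 * made-up example choices; any priority in 1..MAX_USER_RT_PRIO-1 is accepted
 * for an RT policy, as validated above.
 */
static void example_make_kthread_rt(struct task_struct *tsk)
{
	struct sched_param param = { .sched_priority = 50 };

	if (sched_setscheduler(tsk, SCHED_FIFO, &param) < 0)
		printk(KERN_WARNING "%s: could not switch %s to SCHED_FIFO\n",
		       __func__, tsk->comm);
}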
4146
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004147static int
4148do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004149{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004150 struct sched_param lparam;
4151 struct task_struct *p;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004152 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004153
4154 if (!param || pid < 0)
4155 return -EINVAL;
4156 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4157 return -EFAULT;
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004158
4159 rcu_read_lock();
4160 retval = -ESRCH;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004161 p = find_process_by_pid(pid);
Oleg Nesterov5fe1d752006-09-29 02:00:48 -07004162 if (p != NULL)
4163 retval = sched_setscheduler(p, policy, &lparam);
4164 rcu_read_unlock();
Ingo Molnar36c8b582006-07-03 00:25:41 -07004165
Linus Torvalds1da177e2005-04-16 15:20:36 -07004166 return retval;
4167}
4168
4169/**
4170 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4171 * @pid: the pid in question.
4172 * @policy: new policy.
4173 * @param: structure containing the new RT priority.
4174 */
4175asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
4176 struct sched_param __user *param)
4177{
Jason Baronc21761f2006-01-18 17:43:03 -08004178 /* negative values for policy are not valid */
4179 if (policy < 0)
4180 return -EINVAL;
4181
Linus Torvalds1da177e2005-04-16 15:20:36 -07004182 return do_sched_setscheduler(pid, policy, param);
4183}
4184
4185/**
4186 * sys_sched_setparam - set/change the RT priority of a thread
4187 * @pid: the pid in question.
4188 * @param: structure containing the new RT priority.
4189 */
4190asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
4191{
4192 return do_sched_setscheduler(pid, -1, param);
4193}
4194
4195/**
4196 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4197 * @pid: the pid in question.
4198 */
4199asmlinkage long sys_sched_getscheduler(pid_t pid)
4200{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004201 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004202 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004203
4204 if (pid < 0)
4205 goto out_nounlock;
4206
4207 retval = -ESRCH;
4208 read_lock(&tasklist_lock);
4209 p = find_process_by_pid(pid);
4210 if (p) {
4211 retval = security_task_getscheduler(p);
4212 if (!retval)
4213 retval = p->policy;
4214 }
4215 read_unlock(&tasklist_lock);
4216
4217out_nounlock:
4218 return retval;
4219}
4220
4221/**
4222 * sys_sched_getparam - get the RT priority of a thread
4223 * @pid: the pid in question.
4224 * @param: structure containing the RT priority.
4225 */
4226asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
4227{
4228 struct sched_param lp;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004229 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004230 int retval = -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004231
4232 if (!param || pid < 0)
4233 goto out_nounlock;
4234
4235 read_lock(&tasklist_lock);
4236 p = find_process_by_pid(pid);
4237 retval = -ESRCH;
4238 if (!p)
4239 goto out_unlock;
4240
4241 retval = security_task_getscheduler(p);
4242 if (retval)
4243 goto out_unlock;
4244
4245 lp.sched_priority = p->rt_priority;
4246 read_unlock(&tasklist_lock);
4247
4248 /*
4249	 * This one might sleep; we cannot do it with a spinlock held ...
4250 */
4251 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4252
4253out_nounlock:
4254 return retval;
4255
4256out_unlock:
4257 read_unlock(&tasklist_lock);
4258 return retval;
4259}
4260
4261long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4262{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004263 cpumask_t cpus_allowed;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004264 struct task_struct *p;
4265 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004266
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004267 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004268 read_lock(&tasklist_lock);
4269
4270 p = find_process_by_pid(pid);
4271 if (!p) {
4272 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004273 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004274 return -ESRCH;
4275 }
4276
4277 /*
4278 * It is not safe to call set_cpus_allowed with the
4279 * tasklist_lock held. We will bump the task_struct's
4280 * usage count and then drop tasklist_lock.
4281 */
4282 get_task_struct(p);
4283 read_unlock(&tasklist_lock);
4284
4285 retval = -EPERM;
4286 if ((current->euid != p->euid) && (current->euid != p->uid) &&
4287 !capable(CAP_SYS_NICE))
4288 goto out_unlock;
4289
David Quigleye7834f82006-06-23 02:03:59 -07004290 retval = security_task_setscheduler(p, 0, NULL);
4291 if (retval)
4292 goto out_unlock;
4293
Linus Torvalds1da177e2005-04-16 15:20:36 -07004294 cpus_allowed = cpuset_cpus_allowed(p);
4295 cpus_and(new_mask, new_mask, cpus_allowed);
4296 retval = set_cpus_allowed(p, new_mask);
4297
4298out_unlock:
4299 put_task_struct(p);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004300 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004301 return retval;
4302}
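/*
 * Illustrative sketch, not part of the original file: pinning a task to a
 * single CPU by pid via the sched_setaffinity() helper above.  The requested
 * mask is still intersected with the task's cpuset, exactly as in the
 * function above; the helper name is a made-up example.
 */
static int example_pin_pid_to_cpu(pid_t pid, int cpu)
{
	cpumask_t mask = CPU_MASK_NONE;

	cpu_set(cpu, mask);
	return sched_setaffinity(pid, mask);
}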
4303
4304static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4305 cpumask_t *new_mask)
4306{
4307 if (len < sizeof(cpumask_t)) {
4308 memset(new_mask, 0, sizeof(cpumask_t));
4309 } else if (len > sizeof(cpumask_t)) {
4310 len = sizeof(cpumask_t);
4311 }
4312 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4313}
4314
4315/**
4316 * sys_sched_setaffinity - set the cpu affinity of a process
4317 * @pid: pid of the process
4318 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4319 * @user_mask_ptr: user-space pointer to the new cpu mask
4320 */
4321asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
4322 unsigned long __user *user_mask_ptr)
4323{
4324 cpumask_t new_mask;
4325 int retval;
4326
4327 retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
4328 if (retval)
4329 return retval;
4330
4331 return sched_setaffinity(pid, new_mask);
4332}
4333
4334/*
4335 * Represents all CPUs present in the system.
4336 * In systems capable of hotplug, this map could dynamically grow
4337 * as new CPUs are detected in the system via any platform-specific
4338 * method, such as ACPI, for example.
4339 */
4340
Andi Kleen4cef0c62006-01-11 22:44:57 +01004341cpumask_t cpu_present_map __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004342EXPORT_SYMBOL(cpu_present_map);
4343
4344#ifndef CONFIG_SMP
Andi Kleen4cef0c62006-01-11 22:44:57 +01004345cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004346EXPORT_SYMBOL(cpu_online_map);
4347
Andi Kleen4cef0c62006-01-11 22:44:57 +01004348cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
Greg Bankse16b38f2006-10-02 02:17:40 -07004349EXPORT_SYMBOL(cpu_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004350#endif
4351
4352long sched_getaffinity(pid_t pid, cpumask_t *mask)
4353{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004354 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004355 int retval;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004356
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004357 mutex_lock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004358 read_lock(&tasklist_lock);
4359
4360 retval = -ESRCH;
4361 p = find_process_by_pid(pid);
4362 if (!p)
4363 goto out_unlock;
4364
David Quigleye7834f82006-06-23 02:03:59 -07004365 retval = security_task_getscheduler(p);
4366 if (retval)
4367 goto out_unlock;
4368
Jack Steiner2f7016d2006-02-01 03:05:18 -08004369 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004370
4371out_unlock:
4372 read_unlock(&tasklist_lock);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07004373 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004374 if (retval)
4375 return retval;
4376
4377 return 0;
4378}
4379
4380/**
4381 * sys_sched_getaffinity - get the cpu affinity of a process
4382 * @pid: pid of the process
4383 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4384 * @user_mask_ptr: user-space pointer to hold the current cpu mask
4385 */
4386asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4387 unsigned long __user *user_mask_ptr)
4388{
4389 int ret;
4390 cpumask_t mask;
4391
4392 if (len < sizeof(cpumask_t))
4393 return -EINVAL;
4394
4395 ret = sched_getaffinity(pid, &mask);
4396 if (ret < 0)
4397 return ret;
4398
4399 if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
4400 return -EFAULT;
4401
4402 return sizeof(cpumask_t);
4403}
4404
4405/**
4406 * sys_sched_yield - yield the current processor to other threads.
4407 *
Ingo Molnardd41f592007-07-09 18:51:59 +02004408 * This function yields the current CPU to other tasks. If there are no
4409 * other threads running on this CPU then this function will return.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004410 */
4411asmlinkage long sys_sched_yield(void)
4412{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004413 struct rq *rq = this_rq_lock();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004414
4415 schedstat_inc(rq, yld_cnt);
Ingo Molnardd41f592007-07-09 18:51:59 +02004416 if (unlikely(rq->nr_running == 1))
Linus Torvalds1da177e2005-04-16 15:20:36 -07004417 schedstat_inc(rq, yld_act_empty);
Ingo Molnardd41f592007-07-09 18:51:59 +02004418 else
4419 current->sched_class->yield_task(rq, current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004420
4421 /*
4422 * Since we are going to call schedule() anyway, there's
4423 * no need to preempt or enable interrupts:
4424 */
4425 __release(rq->lock);
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004426 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004427 _raw_spin_unlock(&rq->lock);
4428 preempt_enable_no_resched();
4429
4430 schedule();
4431
4432 return 0;
4433}
4434
Andrew Mortone7b38402006-06-30 01:56:00 -07004435static void __cond_resched(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004436{
Ingo Molnar8e0a43d2006-06-23 02:05:23 -07004437#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4438 __might_sleep(__FILE__, __LINE__);
4439#endif
Ingo Molnar5bbcfd92005-07-07 17:57:04 -07004440 /*
4441 * The BKS might be reacquired before we have dropped
4442 * PREEMPT_ACTIVE, which could trigger a second
4443 * cond_resched() call.
4444 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07004445 do {
4446 add_preempt_count(PREEMPT_ACTIVE);
4447 schedule();
4448 sub_preempt_count(PREEMPT_ACTIVE);
4449 } while (need_resched());
4450}
4451
4452int __sched cond_resched(void)
4453{
Ingo Molnar94142322006-12-29 16:48:13 -08004454 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4455 system_state == SYSTEM_RUNNING) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07004456 __cond_resched();
4457 return 1;
4458 }
4459 return 0;
4460}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004461EXPORT_SYMBOL(cond_resched);
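/*
 * Illustrative sketch, not part of the original file: the usual way to use
 * cond_resched() - drop it into a long-running loop in process context so
 * that other tasks get to run even on non-preemptible kernels.  The helper
 * name and the notion of "items" are made up for the example.
 */
static void example_process_many_items(unsigned long nr_items)
{
	unsigned long i;

	for (i = 0; i < nr_items; i++) {
		/* ... per-item work that may add up to a long time ... */
		cond_resched();
	}
}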
4462
4463/*
4464 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4465 * call schedule, and on return reacquire the lock.
4466 *
4467 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4468 * operations here to prevent schedule() from being called twice (once via
4469 * spin_unlock(), once by hand).
4470 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004471int cond_resched_lock(spinlock_t *lock)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004472{
Jan Kara6df3cec2005-06-13 15:52:32 -07004473 int ret = 0;
4474
Linus Torvalds1da177e2005-04-16 15:20:36 -07004475 if (need_lockbreak(lock)) {
4476 spin_unlock(lock);
4477 cpu_relax();
Jan Kara6df3cec2005-06-13 15:52:32 -07004478 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004479 spin_lock(lock);
4480 }
Ingo Molnar94142322006-12-29 16:48:13 -08004481 if (need_resched() && system_state == SYSTEM_RUNNING) {
Ingo Molnar8a25d5d2006-07-03 00:24:54 -07004482 spin_release(&lock->dep_map, 1, _THIS_IP_);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004483 _raw_spin_unlock(lock);
4484 preempt_enable_no_resched();
4485 __cond_resched();
Jan Kara6df3cec2005-06-13 15:52:32 -07004486 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004487 spin_lock(lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004488 }
Jan Kara6df3cec2005-06-13 15:52:32 -07004489 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004490}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004491EXPORT_SYMBOL(cond_resched_lock);
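/*
 * Illustrative sketch, not part of the original file: doing a long batch of
 * work under a spinlock while letting cond_resched_lock() break the lock
 * when a reschedule (or lock break) is pending.  A return value of 1 means
 * the lock was dropped and re-taken, so any state protected only by the
 * lock must be revalidated; the helper and its arguments are hypothetical.
 */
static void example_drain_under_lock(spinlock_t *lock, unsigned long nr)
{
	unsigned long i;

	spin_lock(lock);
	for (i = 0; i < nr; i++) {
		/* ... work that requires the lock ... */
		if (cond_resched_lock(lock)) {
			/* lock was released and re-acquired here */
		}
	}
	spin_unlock(lock);
}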
4492
4493int __sched cond_resched_softirq(void)
4494{
4495 BUG_ON(!in_softirq());
4496
Ingo Molnar94142322006-12-29 16:48:13 -08004497 if (need_resched() && system_state == SYSTEM_RUNNING) {
Thomas Gleixner98d825672007-05-23 13:58:18 -07004498 local_bh_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004499 __cond_resched();
4500 local_bh_disable();
4501 return 1;
4502 }
4503 return 0;
4504}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004505EXPORT_SYMBOL(cond_resched_softirq);
4506
Linus Torvalds1da177e2005-04-16 15:20:36 -07004507/**
4508 * yield - yield the current processor to other threads.
4509 *
Robert P. J. Day72fd4a32007-02-10 01:45:59 -08004510 * This is a shortcut for kernel-space yielding - it marks the
Linus Torvalds1da177e2005-04-16 15:20:36 -07004511 * thread runnable and calls sys_sched_yield().
4512 */
4513void __sched yield(void)
4514{
4515 set_current_state(TASK_RUNNING);
4516 sys_sched_yield();
4517}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004518EXPORT_SYMBOL(yield);
4519
4520/*
4521 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4522 * that process accounting knows that this is a task in IO wait state.
4523 *
4524 * But don't do that if it is a deliberate, throttling IO wait (this task
4525 * has set its backing_dev_info: the queue against which it should throttle)
4526 */
4527void __sched io_schedule(void)
4528{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004529 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004530
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004531 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004532 atomic_inc(&rq->nr_iowait);
4533 schedule();
4534 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004535 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004536}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004537EXPORT_SYMBOL(io_schedule);
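/*
 * Illustrative sketch, not part of the original file: a sleeper that wants
 * to be counted in rq->nr_iowait (and in block I/O delay accounting) calls
 * io_schedule() instead of schedule() once it has queued itself on a wait
 * queue.  The wait queue and completion flag here are hypothetical.
 */
static void example_wait_for_io(wait_queue_head_t *wq, int *io_done)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
		if (*io_done)
			break;
		io_schedule();
	}
	finish_wait(wq, &wait);
}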
4538
4539long __sched io_schedule_timeout(long timeout)
4540{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004541 struct rq *rq = &__raw_get_cpu_var(runqueues);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004542 long ret;
4543
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004544 delayacct_blkio_start();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004545 atomic_inc(&rq->nr_iowait);
4546 ret = schedule_timeout(timeout);
4547 atomic_dec(&rq->nr_iowait);
Shailabh Nagar0ff92242006-07-14 00:24:37 -07004548 delayacct_blkio_end();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004549 return ret;
4550}
4551
4552/**
4553 * sys_sched_get_priority_max - return maximum RT priority.
4554 * @policy: scheduling class.
4555 *
4556 * this syscall returns the maximum rt_priority that can be used
4557 * by a given scheduling class.
4558 */
4559asmlinkage long sys_sched_get_priority_max(int policy)
4560{
4561 int ret = -EINVAL;
4562
4563 switch (policy) {
4564 case SCHED_FIFO:
4565 case SCHED_RR:
4566 ret = MAX_USER_RT_PRIO-1;
4567 break;
4568 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004569 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004570 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004571 ret = 0;
4572 break;
4573 }
4574 return ret;
4575}
4576
4577/**
4578 * sys_sched_get_priority_min - return minimum RT priority.
4579 * @policy: scheduling class.
4580 *
4581 * this syscall returns the minimum rt_priority that can be used
4582 * by a given scheduling class.
4583 */
4584asmlinkage long sys_sched_get_priority_min(int policy)
4585{
4586 int ret = -EINVAL;
4587
4588 switch (policy) {
4589 case SCHED_FIFO:
4590 case SCHED_RR:
4591 ret = 1;
4592 break;
4593 case SCHED_NORMAL:
Ingo Molnarb0a94992006-01-14 13:20:41 -08004594 case SCHED_BATCH:
Ingo Molnardd41f592007-07-09 18:51:59 +02004595 case SCHED_IDLE:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004596 ret = 0;
4597 }
4598 return ret;
4599}
4600
4601/**
4602 * sys_sched_rr_get_interval - return the default timeslice of a process.
4603 * @pid: pid of the process.
4604 * @interval: userspace pointer to the timeslice value.
4605 *
4606 * this syscall writes the default timeslice value of a given process
4607 * into the user-space timespec buffer. A value of '0' means infinity.
4608 */
4609asmlinkage
4610long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4611{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004612 struct task_struct *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004613 int retval = -EINVAL;
4614 struct timespec t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004615
4616 if (pid < 0)
4617 goto out_nounlock;
4618
4619 retval = -ESRCH;
4620 read_lock(&tasklist_lock);
4621 p = find_process_by_pid(pid);
4622 if (!p)
4623 goto out_unlock;
4624
4625 retval = security_task_getscheduler(p);
4626 if (retval)
4627 goto out_unlock;
4628
Peter Williamsb78709c2006-06-26 16:58:00 +10004629 jiffies_to_timespec(p->policy == SCHED_FIFO ?
Ingo Molnardd41f592007-07-09 18:51:59 +02004630 0 : static_prio_timeslice(p->static_prio), &t);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004631 read_unlock(&tasklist_lock);
4632 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
4633out_nounlock:
4634 return retval;
4635out_unlock:
4636 read_unlock(&tasklist_lock);
4637 return retval;
4638}
4639
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004640static const char stat_nam[] = "RSDTtZX";
Ingo Molnar36c8b582006-07-03 00:25:41 -07004641
4642static void show_task(struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004643{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004644 unsigned long free = 0;
Ingo Molnar36c8b582006-07-03 00:25:41 -07004645 unsigned state;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004646
Linus Torvalds1da177e2005-04-16 15:20:36 -07004647 state = p->state ? __ffs(p->state) + 1 : 0;
Andreas Mohr2ed6e342006-07-10 04:43:52 -07004648 printk("%-13.13s %c", p->comm,
4649 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
Linus Torvalds1da177e2005-04-16 15:20:36 -07004650#if (BITS_PER_LONG == 32)
4651 if (state == TASK_RUNNING)
4652 printk(" running ");
4653 else
4654 printk(" %08lX ", thread_saved_pc(p));
4655#else
4656 if (state == TASK_RUNNING)
4657 printk(" running task ");
4658 else
4659 printk(" %016lx ", thread_saved_pc(p));
4660#endif
4661#ifdef CONFIG_DEBUG_STACK_USAGE
4662 {
Al Viro10ebffd2005-11-13 16:06:56 -08004663 unsigned long *n = end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004664 while (!*n)
4665 n++;
Al Viro10ebffd2005-11-13 16:06:56 -08004666 free = (unsigned long)n - (unsigned long)end_of_stack(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004667 }
4668#endif
Ingo Molnar35f6f752007-04-06 21:18:06 +02004669 printk("%5lu %5d %6d", free, p->pid, p->parent->pid);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004670 if (!p->mm)
4671 printk(" (L-TLB)\n");
4672 else
4673 printk(" (NOTLB)\n");
4674
4675 if (state != TASK_RUNNING)
4676 show_stack(p, NULL);
4677}
4678
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004679void show_state_filter(unsigned long state_filter)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004680{
Ingo Molnar36c8b582006-07-03 00:25:41 -07004681 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004682
4683#if (BITS_PER_LONG == 32)
4684 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004685 " free sibling\n");
4686 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004687#else
4688 printk("\n"
Chris Caputo301827a2006-12-06 20:39:11 -08004689 " free sibling\n");
4690 printk(" task PC stack pid father child younger older\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07004691#endif
4692 read_lock(&tasklist_lock);
4693 do_each_thread(g, p) {
4694 /*
4695		 * reset the NMI-timeout, listing all tasks on a slow
4696		 * console might take a lot of time:
4697 */
4698 touch_nmi_watchdog();
Ingo Molnar39bc89f2007-04-25 20:50:03 -07004699 if (!state_filter || (p->state & state_filter))
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004700 show_task(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004701 } while_each_thread(g, p);
4702
Jeremy Fitzhardinge04c91672007-05-08 00:28:05 -07004703 touch_all_softlockup_watchdogs();
4704
Ingo Molnardd41f592007-07-09 18:51:59 +02004705#ifdef CONFIG_SCHED_DEBUG
4706 sysrq_sched_debug_show();
4707#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004708 read_unlock(&tasklist_lock);
Ingo Molnare59e2ae2006-12-06 20:35:59 -08004709 /*
4710 * Only show locks if all tasks are dumped:
4711 */
4712 if (state_filter == -1)
4713 debug_show_all_locks();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004714}
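/*
 * Illustrative note, not part of the original file: callers pass a task
 * state mask to narrow the dump - e.g. show_state_filter(TASK_UNINTERRUPTIBLE)
 * prints only D-state tasks, while show_state_filter(0), reachable through
 * the show_state() wrapper, dumps every task, as the SysRq task dump does.
 */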
4715
Ingo Molnar1df21052007-07-09 18:51:58 +02004716void __cpuinit init_idle_bootup_task(struct task_struct *idle)
4717{
Ingo Molnardd41f592007-07-09 18:51:59 +02004718 idle->sched_class = &idle_sched_class;
Ingo Molnar1df21052007-07-09 18:51:58 +02004719}
4720
Ingo Molnarf340c0d2005-06-28 16:40:42 +02004721/**
4722 * init_idle - set up an idle thread for a given CPU
4723 * @idle: task in question
4724 * @cpu: cpu the idle task belongs to
4725 *
4726 * NOTE: this function does not set the idle thread's NEED_RESCHED
4727 * flag, to make booting more robust.
4728 */
Nick Piggin5c1e1762006-10-03 01:14:04 -07004729void __cpuinit init_idle(struct task_struct *idle, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004730{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004731 struct rq *rq = cpu_rq(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004732 unsigned long flags;
4733
Ingo Molnardd41f592007-07-09 18:51:59 +02004734 __sched_fork(idle);
4735 idle->se.exec_start = sched_clock();
4736
Ingo Molnarb29739f2006-06-27 02:54:51 -07004737 idle->prio = idle->normal_prio = MAX_PRIO;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004738 idle->cpus_allowed = cpumask_of_cpu(cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004739 __set_task_cpu(idle, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004740
4741 spin_lock_irqsave(&rq->lock, flags);
4742 rq->curr = rq->idle = idle;
Nick Piggin4866cde2005-06-25 14:57:23 -07004743#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
4744 idle->oncpu = 1;
4745#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07004746 spin_unlock_irqrestore(&rq->lock, flags);
4747
4748 /* Set the preempt count _outside_ the spinlocks! */
4749#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
Al Viroa1261f542005-11-13 16:06:55 -08004750 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004751#else
Al Viroa1261f542005-11-13 16:06:55 -08004752 task_thread_info(idle)->preempt_count = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004753#endif
Ingo Molnardd41f592007-07-09 18:51:59 +02004754 /*
4755 * The idle tasks have their own, simple scheduling class:
4756 */
4757 idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004758}
4759
4760/*
4761 * In a system that switches off the HZ timer nohz_cpu_mask
4762 * indicates which cpus entered this state. This is used
4763 * in the rcu update to wait only for active cpus. For system
4764 * which do not switch off the HZ timer nohz_cpu_mask should
4765 * always be CPU_MASK_NONE.
4766 */
4767cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4768
Ingo Molnardd41f592007-07-09 18:51:59 +02004769/*
4770 * Increase the granularity value when there are more CPUs,
4771 * because with more CPUs the 'effective latency' as visible
4772 * to users decreases. But the relationship is not linear,
4773 * so pick a second-best guess by going with the log2 of the
4774 * number of CPUs.
4775 *
4776 * This idea comes from the SD scheduler of Con Kolivas:
4777 */
4778static inline void sched_init_granularity(void)
4779{
4780 unsigned int factor = 1 + ilog2(num_online_cpus());
4781 const unsigned long gran_limit = 10000000;
4782
4783 sysctl_sched_granularity *= factor;
4784 if (sysctl_sched_granularity > gran_limit)
4785 sysctl_sched_granularity = gran_limit;
4786
4787 sysctl_sched_runtime_limit = sysctl_sched_granularity * 4;
4788 sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2;
4789}
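/*
 * Illustrative worked example, not part of the original file: on an 8-CPU
 * machine the code above gives factor = 1 + ilog2(8) = 4, so the base
 * granularity is multiplied by 4 (and clipped at gran_limit = 10ms); the
 * runtime limit then becomes 4x and the wakeup granularity 1/2 of that
 * scaled value.
 */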
4790
Linus Torvalds1da177e2005-04-16 15:20:36 -07004791#ifdef CONFIG_SMP
4792/*
4793 * This is how migration works:
4794 *
Ingo Molnar70b97a72006-07-03 00:25:42 -07004795 * 1) we queue a struct migration_req structure in the source CPU's
Linus Torvalds1da177e2005-04-16 15:20:36 -07004796 * runqueue and wake up that CPU's migration thread.
4797 * 2) we wait on the request's completion => thread blocks.
4798 * 3) migration thread wakes up (implicitly it forces the migrated
4799 * thread off the CPU)
4800 * 4) it gets the migration request and checks whether the migrated
4801 * task is still in the wrong runqueue.
4802 * 5) if it's in the wrong runqueue then the migration thread removes
4803 * it and puts it into the right queue.
4804 * 6) migration thread up()s the semaphore.
4805 * 6) migration thread completes the request.
4806 */
4807
4808/*
4809 * Change a given task's CPU affinity. Migrate the thread to a
4810 * proper CPU and schedule it away if the CPU it's executing on
4811 * is removed from the allowed bitmask.
4812 *
4813 * NOTE: the caller must have a valid reference to the task, the
4814 * task must not exit() & deallocate itself prematurely. The
4815 * call is not atomic; no spinlocks may be held.
4816 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07004817int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004818{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004819 struct migration_req req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004820 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004821 struct rq *rq;
Ingo Molnar48f24c42006-07-03 00:25:40 -07004822 int ret = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004823
4824 rq = task_rq_lock(p, &flags);
4825 if (!cpus_intersects(new_mask, cpu_online_map)) {
4826 ret = -EINVAL;
4827 goto out;
4828 }
4829
4830 p->cpus_allowed = new_mask;
4831 /* Can the task run on the task's current CPU? If so, we're done */
4832 if (cpu_isset(task_cpu(p), new_mask))
4833 goto out;
4834
4835 if (migrate_task(p, any_online_cpu(new_mask), &req)) {
4836 /* Need help from migration thread: drop lock and wait. */
4837 task_rq_unlock(rq, &flags);
4838 wake_up_process(rq->migration_thread);
4839 wait_for_completion(&req.done);
4840 tlb_migrate_finish(p->mm);
4841 return 0;
4842 }
4843out:
4844 task_rq_unlock(rq, &flags);
Ingo Molnar48f24c42006-07-03 00:25:40 -07004845
Linus Torvalds1da177e2005-04-16 15:20:36 -07004846 return ret;
4847}
Linus Torvalds1da177e2005-04-16 15:20:36 -07004848EXPORT_SYMBOL_GPL(set_cpus_allowed);
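/*
 * Illustrative sketch, not part of the original file: a typical caller of
 * set_cpus_allowed() confining a kernel thread it owns to one CPU, in the
 * style of per-CPU worker threads.  The helper name is a made-up example;
 * per the comment above, the caller must hold a valid reference on the task.
 */
static int example_confine_kthread(struct task_struct *tsk, int cpu)
{
	return set_cpus_allowed(tsk, cpumask_of_cpu(cpu));
}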
4849
4850/*
4851 * Move (not current) task off this cpu, onto dest cpu. We're doing
4852 * this because either it can't run here any more (set_cpus_allowed()
4853 * away from this CPU, or CPU going down), or because we're
4854 * attempting to rebalance this task on exec (sched_exec).
4855 *
4856 * So we race with normal scheduler movements, but that's OK, as long
4857 * as the task is no longer on this CPU.
Kirill Korotaevefc30812006-06-27 02:54:32 -07004858 *
4859 * Returns non-zero if task was successfully migrated.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004860 */
Kirill Korotaevefc30812006-06-27 02:54:32 -07004861static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004862{
Ingo Molnar70b97a72006-07-03 00:25:42 -07004863 struct rq *rq_dest, *rq_src;
Ingo Molnardd41f592007-07-09 18:51:59 +02004864 int ret = 0, on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004865
4866 if (unlikely(cpu_is_offline(dest_cpu)))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004867 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004868
4869 rq_src = cpu_rq(src_cpu);
4870 rq_dest = cpu_rq(dest_cpu);
4871
4872 double_rq_lock(rq_src, rq_dest);
4873 /* Already moved. */
4874 if (task_cpu(p) != src_cpu)
4875 goto out;
4876 /* Affinity changed (again). */
4877 if (!cpu_isset(dest_cpu, p->cpus_allowed))
4878 goto out;
4879
Ingo Molnardd41f592007-07-09 18:51:59 +02004880 on_rq = p->se.on_rq;
4881 if (on_rq)
4882 deactivate_task(rq_src, p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004883 set_task_cpu(p, dest_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02004884 if (on_rq) {
4885 activate_task(rq_dest, p, 0);
4886 check_preempt_curr(rq_dest, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004887 }
Kirill Korotaevefc30812006-06-27 02:54:32 -07004888 ret = 1;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004889out:
4890 double_rq_unlock(rq_src, rq_dest);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004891 return ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004892}
4893
4894/*
4895 * migration_thread - this is a highprio system thread that performs
4896 * thread migration by bumping thread off CPU then 'pushing' onto
4897 * another runqueue.
4898 */
Ingo Molnar95cdf3b2005-09-10 00:26:11 -07004899static int migration_thread(void *data)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004900{
Linus Torvalds1da177e2005-04-16 15:20:36 -07004901 int cpu = (long)data;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004902 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004903
4904 rq = cpu_rq(cpu);
4905 BUG_ON(rq->migration_thread != current);
4906
4907 set_current_state(TASK_INTERRUPTIBLE);
4908 while (!kthread_should_stop()) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07004909 struct migration_req *req;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004910 struct list_head *head;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004911
Christoph Lameter3e1d1d22005-06-24 23:13:50 -07004912 try_to_freeze();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004913
4914 spin_lock_irq(&rq->lock);
4915
4916 if (cpu_is_offline(cpu)) {
4917 spin_unlock_irq(&rq->lock);
4918 goto wait_to_die;
4919 }
4920
4921 if (rq->active_balance) {
4922 active_load_balance(rq, cpu);
4923 rq->active_balance = 0;
4924 }
4925
4926 head = &rq->migration_queue;
4927
4928 if (list_empty(head)) {
4929 spin_unlock_irq(&rq->lock);
4930 schedule();
4931 set_current_state(TASK_INTERRUPTIBLE);
4932 continue;
4933 }
Ingo Molnar70b97a72006-07-03 00:25:42 -07004934 req = list_entry(head->next, struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004935 list_del_init(head->next);
4936
Nick Piggin674311d2005-06-25 14:57:27 -07004937 spin_unlock(&rq->lock);
4938 __migrate_task(req->task, cpu, req->dest_cpu);
4939 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -07004940
4941 complete(&req->done);
4942 }
4943 __set_current_state(TASK_RUNNING);
4944 return 0;
4945
4946wait_to_die:
4947 /* Wait for kthread_stop */
4948 set_current_state(TASK_INTERRUPTIBLE);
4949 while (!kthread_should_stop()) {
4950 schedule();
4951 set_current_state(TASK_INTERRUPTIBLE);
4952 }
4953 __set_current_state(TASK_RUNNING);
4954 return 0;
4955}
4956
4957#ifdef CONFIG_HOTPLUG_CPU
Kirill Korotaev054b9102006-12-10 02:20:11 -08004958/*
4959 * Figure out where a task on a dead CPU should go; use force if necessary.
4960 * NOTE: interrupts should be disabled by the caller
4961 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07004962static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07004963{
Kirill Korotaevefc30812006-06-27 02:54:32 -07004964 unsigned long flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004965 cpumask_t mask;
Ingo Molnar70b97a72006-07-03 00:25:42 -07004966 struct rq *rq;
4967 int dest_cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004968
Kirill Korotaevefc30812006-06-27 02:54:32 -07004969restart:
Linus Torvalds1da177e2005-04-16 15:20:36 -07004970 /* On same node? */
4971 mask = node_to_cpumask(cpu_to_node(dead_cpu));
Ingo Molnar48f24c42006-07-03 00:25:40 -07004972 cpus_and(mask, mask, p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004973 dest_cpu = any_online_cpu(mask);
4974
4975 /* On any allowed CPU? */
4976 if (dest_cpu == NR_CPUS)
Ingo Molnar48f24c42006-07-03 00:25:40 -07004977 dest_cpu = any_online_cpu(p->cpus_allowed);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004978
4979 /* No more Mr. Nice Guy. */
4980 if (dest_cpu == NR_CPUS) {
Ingo Molnar48f24c42006-07-03 00:25:40 -07004981 rq = task_rq_lock(p, &flags);
4982 cpus_setall(p->cpus_allowed);
4983 dest_cpu = any_online_cpu(p->cpus_allowed);
Kirill Korotaevefc30812006-06-27 02:54:32 -07004984 task_rq_unlock(rq, &flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004985
4986 /*
4987 * Don't tell them about moving exiting tasks or
4988 * kernel threads (both mm NULL), since they never
4989 * leave kernel.
4990 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07004991 if (p->mm && printk_ratelimit())
Linus Torvalds1da177e2005-04-16 15:20:36 -07004992 printk(KERN_INFO "process %d (%s) no "
4993 "longer affine to cpu%d\n",
Ingo Molnar48f24c42006-07-03 00:25:40 -07004994 p->pid, p->comm, dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07004995 }
Ingo Molnar48f24c42006-07-03 00:25:40 -07004996 if (!__migrate_task(p, dead_cpu, dest_cpu))
Kirill Korotaevefc30812006-06-27 02:54:32 -07004997 goto restart;
Linus Torvalds1da177e2005-04-16 15:20:36 -07004998}
4999
5000/*
5001 * While a dead CPU has no uninterruptible tasks queued at this point,
5002 * it might still have a nonzero ->nr_uninterruptible counter, because
5003 * for performance reasons the counter is not strictly tracking tasks to
5004 * their home CPUs. So we just add the counter to another CPU's counter,
5005 * to keep the global sum constant after CPU-down:
5006 */
Ingo Molnar70b97a72006-07-03 00:25:42 -07005007static void migrate_nr_uninterruptible(struct rq *rq_src)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005008{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005009 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005010 unsigned long flags;
5011
5012 local_irq_save(flags);
5013 double_rq_lock(rq_src, rq_dest);
5014 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5015 rq_src->nr_uninterruptible = 0;
5016 double_rq_unlock(rq_src, rq_dest);
5017 local_irq_restore(flags);
5018}
5019
5020/* Run through task list and migrate tasks from the dead cpu. */
5021static void migrate_live_tasks(int src_cpu)
5022{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005023 struct task_struct *p, *t;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005024
5025 write_lock_irq(&tasklist_lock);
5026
Ingo Molnar48f24c42006-07-03 00:25:40 -07005027 do_each_thread(t, p) {
5028 if (p == current)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005029 continue;
5030
Ingo Molnar48f24c42006-07-03 00:25:40 -07005031 if (task_cpu(p) == src_cpu)
5032 move_task_off_dead_cpu(src_cpu, p);
5033 } while_each_thread(t, p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005034
5035 write_unlock_irq(&tasklist_lock);
5036}
5037
Ingo Molnardd41f592007-07-09 18:51:59 +02005038/*
5039 * Schedules idle task to be the next runnable task on current CPU.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005040 * It does so by boosting its priority to highest possible and adding it to
Ingo Molnar48f24c42006-07-03 00:25:40 -07005041 * the _front_ of the runqueue. Used by CPU offline code.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005042 */
5043void sched_idle_next(void)
5044{
Ingo Molnar48f24c42006-07-03 00:25:40 -07005045 int this_cpu = smp_processor_id();
Ingo Molnar70b97a72006-07-03 00:25:42 -07005046 struct rq *rq = cpu_rq(this_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005047 struct task_struct *p = rq->idle;
5048 unsigned long flags;
5049
5050 /* cpu has to be offline */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005051 BUG_ON(cpu_online(this_cpu));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005052
Ingo Molnar48f24c42006-07-03 00:25:40 -07005053 /*
5054	 * Strictly not necessary since the rest of the CPUs are stopped by now
5055 * and interrupts disabled on the current cpu.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005056 */
5057 spin_lock_irqsave(&rq->lock, flags);
5058
Ingo Molnardd41f592007-07-09 18:51:59 +02005059 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005060
5061 /* Add idle task to the _front_ of its priority queue: */
Ingo Molnardd41f592007-07-09 18:51:59 +02005062 activate_idle_task(p, rq);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005063
5064 spin_unlock_irqrestore(&rq->lock, flags);
5065}
5066
Ingo Molnar48f24c42006-07-03 00:25:40 -07005067/*
5068 * Ensures that the idle task is using init_mm right before its cpu goes
Linus Torvalds1da177e2005-04-16 15:20:36 -07005069 * offline.
5070 */
5071void idle_task_exit(void)
5072{
5073 struct mm_struct *mm = current->active_mm;
5074
5075 BUG_ON(cpu_online(smp_processor_id()));
5076
5077 if (mm != &init_mm)
5078 switch_mm(mm, &init_mm, current);
5079 mmdrop(mm);
5080}
5081
Kirill Korotaev054b9102006-12-10 02:20:11 -08005082/* called under rq->lock with disabled interrupts */
Ingo Molnar36c8b582006-07-03 00:25:41 -07005083static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005084{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005085 struct rq *rq = cpu_rq(dead_cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005086
5087 /* Must be exiting, otherwise would be on tasklist. */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005088 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005089
5090 /* Cannot have done final schedule yet: would have vanished. */
Oleg Nesterovc394cc92006-09-29 02:01:11 -07005091 BUG_ON(p->state == TASK_DEAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005092
Ingo Molnar48f24c42006-07-03 00:25:40 -07005093 get_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005094
5095 /*
5096 * Drop lock around migration; if someone else moves it,
5097 * that's OK. No task can be added to this CPU, so iteration is
5098 * fine.
Kirill Korotaev054b9102006-12-10 02:20:11 -08005099 * NOTE: interrupts should be left disabled --dev@
Linus Torvalds1da177e2005-04-16 15:20:36 -07005100 */
Kirill Korotaev054b9102006-12-10 02:20:11 -08005101 spin_unlock(&rq->lock);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005102 move_task_off_dead_cpu(dead_cpu, p);
Kirill Korotaev054b9102006-12-10 02:20:11 -08005103 spin_lock(&rq->lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005104
Ingo Molnar48f24c42006-07-03 00:25:40 -07005105 put_task_struct(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005106}
5107
5108/* release_task() removes task from tasklist, so we won't find dead tasks. */
5109static void migrate_dead_tasks(unsigned int dead_cpu)
5110{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005111 struct rq *rq = cpu_rq(dead_cpu);
Ingo Molnardd41f592007-07-09 18:51:59 +02005112 struct task_struct *next;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005113
Ingo Molnardd41f592007-07-09 18:51:59 +02005114 for ( ; ; ) {
5115 if (!rq->nr_running)
5116 break;
5117 next = pick_next_task(rq, rq->curr, rq_clock(rq));
5118 if (!next)
5119 break;
5120 migrate_dead(dead_cpu, next);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005121 }
5122}
5123#endif /* CONFIG_HOTPLUG_CPU */
5124
5125/*
5126 * migration_call - callback that gets triggered when a CPU is added.
5127 * Here we can start up the necessary migration thread for the new CPU.
5128 */
Ingo Molnar48f24c42006-07-03 00:25:40 -07005129static int __cpuinit
5130migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005131{
Linus Torvalds1da177e2005-04-16 15:20:36 -07005132 struct task_struct *p;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005133 int cpu = (long)hcpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005134 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07005135 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005136
5137 switch (action) {
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005138 case CPU_LOCK_ACQUIRE:
5139 mutex_lock(&sched_hotcpu_mutex);
5140 break;
5141
Linus Torvalds1da177e2005-04-16 15:20:36 -07005142 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005143 case CPU_UP_PREPARE_FROZEN:
Ingo Molnardd41f592007-07-09 18:51:59 +02005144 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005145 if (IS_ERR(p))
5146 return NOTIFY_BAD;
5147 p->flags |= PF_NOFREEZE;
5148 kthread_bind(p, cpu);
5149 /* Must be high prio: stop_machine expects to yield to it. */
5150 rq = task_rq_lock(p, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005151 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005152 task_rq_unlock(rq, &flags);
5153 cpu_rq(cpu)->migration_thread = p;
5154 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005155
Linus Torvalds1da177e2005-04-16 15:20:36 -07005156 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005157 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005158		/* Strictly unnecessary, as the first user will wake it. */
5159 wake_up_process(cpu_rq(cpu)->migration_thread);
5160 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005161
Linus Torvalds1da177e2005-04-16 15:20:36 -07005162#ifdef CONFIG_HOTPLUG_CPU
5163 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005164 case CPU_UP_CANCELED_FROZEN:
Heiko Carstensfc75cdf2006-06-25 05:49:10 -07005165 if (!cpu_rq(cpu)->migration_thread)
5166 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005167 /* Unbind it from offline cpu so it can run. Fall thru. */
Heiko Carstensa4c4af72005-11-07 00:58:38 -08005168 kthread_bind(cpu_rq(cpu)->migration_thread,
5169 any_online_cpu(cpu_online_map));
Linus Torvalds1da177e2005-04-16 15:20:36 -07005170 kthread_stop(cpu_rq(cpu)->migration_thread);
5171 cpu_rq(cpu)->migration_thread = NULL;
5172 break;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005173
Linus Torvalds1da177e2005-04-16 15:20:36 -07005174 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07005175 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07005176 migrate_live_tasks(cpu);
5177 rq = cpu_rq(cpu);
5178 kthread_stop(rq->migration_thread);
5179 rq->migration_thread = NULL;
5180 /* Idle task back to normal (off runqueue, low prio) */
5181 rq = task_rq_lock(rq->idle, &flags);
Ingo Molnardd41f592007-07-09 18:51:59 +02005182 deactivate_task(rq, rq->idle, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005183 rq->idle->static_prio = MAX_PRIO;
Ingo Molnardd41f592007-07-09 18:51:59 +02005184 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5185 rq->idle->sched_class = &idle_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005186 migrate_dead_tasks(cpu);
5187 task_rq_unlock(rq, &flags);
5188 migrate_nr_uninterruptible(rq);
5189 BUG_ON(rq->nr_running != 0);
5190
5191 /* No need to migrate the tasks: it was best-effort if
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005192 * they didn't take sched_hotcpu_mutex. Just wake up
Linus Torvalds1da177e2005-04-16 15:20:36 -07005193 * the requestors. */
5194 spin_lock_irq(&rq->lock);
5195 while (!list_empty(&rq->migration_queue)) {
Ingo Molnar70b97a72006-07-03 00:25:42 -07005196 struct migration_req *req;
5197
Linus Torvalds1da177e2005-04-16 15:20:36 -07005198 req = list_entry(rq->migration_queue.next,
Ingo Molnar70b97a72006-07-03 00:25:42 -07005199 struct migration_req, list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005200 list_del_init(&req->list);
5201 complete(&req->done);
5202 }
5203 spin_unlock_irq(&rq->lock);
5204 break;
5205#endif
Gautham R Shenoy5be93612007-05-09 02:34:04 -07005206 case CPU_LOCK_RELEASE:
5207 mutex_unlock(&sched_hotcpu_mutex);
5208 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005209 }
5210 return NOTIFY_OK;
5211}
5212
5213/* Register at highest priority so that task migration (migrate_all_tasks)
5214 * happens before everything else.
5215 */
Chandra Seetharaman26c21432006-06-27 02:54:10 -07005216static struct notifier_block __cpuinitdata migration_notifier = {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005217 .notifier_call = migration_call,
5218 .priority = 10
5219};
5220
5221int __init migration_init(void)
5222{
5223 void *cpu = (void *)(long)smp_processor_id();
Akinobu Mita07dccf32006-09-29 02:00:22 -07005224 int err;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005225
5226 /* Start one for the boot CPU: */
Akinobu Mita07dccf32006-09-29 02:00:22 -07005227 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5228 BUG_ON(err == NOTIFY_BAD);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005229 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5230 register_cpu_notifier(&migration_notifier);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005231
Linus Torvalds1da177e2005-04-16 15:20:36 -07005232 return 0;
5233}
5234#endif
5235
5236#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07005237
5238/* Number of possible processor ids */
5239int nr_cpu_ids __read_mostly = NR_CPUS;
5240EXPORT_SYMBOL(nr_cpu_ids);
5241
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005242#undef SCHED_DOMAIN_DEBUG
Linus Torvalds1da177e2005-04-16 15:20:36 -07005243#ifdef SCHED_DOMAIN_DEBUG
5244static void sched_domain_debug(struct sched_domain *sd, int cpu)
5245{
5246 int level = 0;
5247
Nick Piggin41c7ce92005-06-25 14:57:24 -07005248 if (!sd) {
5249 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
5250 return;
5251 }
5252
Linus Torvalds1da177e2005-04-16 15:20:36 -07005253 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
5254
5255 do {
5256 int i;
5257 char str[NR_CPUS];
5258 struct sched_group *group = sd->groups;
5259 cpumask_t groupmask;
5260
5261 cpumask_scnprintf(str, NR_CPUS, sd->span);
5262 cpus_clear(groupmask);
5263
5264 printk(KERN_DEBUG);
5265 for (i = 0; i < level + 1; i++)
5266 printk(" ");
5267 printk("domain %d: ", level);
5268
5269 if (!(sd->flags & SD_LOAD_BALANCE)) {
5270 printk("does not load-balance\n");
5271 if (sd->parent)
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005272 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
5273 " has parent");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005274 break;
5275 }
5276
5277 printk("span %s\n", str);
5278
5279 if (!cpu_isset(cpu, sd->span))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005280 printk(KERN_ERR "ERROR: domain->span does not contain "
5281 "CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005282 if (!cpu_isset(cpu, group->cpumask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005283 printk(KERN_ERR "ERROR: domain->groups does not contain"
5284 " CPU%d\n", cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005285
5286 printk(KERN_DEBUG);
5287 for (i = 0; i < level + 2; i++)
5288 printk(" ");
5289 printk("groups:");
5290 do {
5291 if (!group) {
5292 printk("\n");
5293 printk(KERN_ERR "ERROR: group is NULL\n");
5294 break;
5295 }
5296
Eric Dumazet5517d862007-05-08 00:32:57 -07005297 if (!group->__cpu_power) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005298 printk("\n");
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005299 printk(KERN_ERR "ERROR: domain->cpu_power not "
5300 "set\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005301 }
5302
5303 if (!cpus_weight(group->cpumask)) {
5304 printk("\n");
5305 printk(KERN_ERR "ERROR: empty group\n");
5306 }
5307
5308 if (cpus_intersects(groupmask, group->cpumask)) {
5309 printk("\n");
5310 printk(KERN_ERR "ERROR: repeated CPUs\n");
5311 }
5312
5313 cpus_or(groupmask, groupmask, group->cpumask);
5314
5315 cpumask_scnprintf(str, NR_CPUS, group->cpumask);
5316 printk(" %s", str);
5317
5318 group = group->next;
5319 } while (group != sd->groups);
5320 printk("\n");
5321
5322 if (!cpus_equal(sd->span, groupmask))
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005323 printk(KERN_ERR "ERROR: groups don't span "
5324 "domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005325
5326 level++;
5327 sd = sd->parent;
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005328 if (!sd)
5329 continue;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005330
Miguel Ojeda Sandonis33859f72006-12-10 02:20:38 -08005331 if (!cpus_subset(groupmask, sd->span))
5332 printk(KERN_ERR "ERROR: parent span is not a superset "
5333 "of domain->span\n");
Linus Torvalds1da177e2005-04-16 15:20:36 -07005334
5335 } while (sd);
5336}
5337#else
Ingo Molnar48f24c42006-07-03 00:25:40 -07005338# define sched_domain_debug(sd, cpu) do { } while (0)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005339#endif
5340
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005341static int sd_degenerate(struct sched_domain *sd)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005342{
5343 if (cpus_weight(sd->span) == 1)
5344 return 1;
5345
5346 /* Following flags need at least 2 groups */
5347 if (sd->flags & (SD_LOAD_BALANCE |
5348 SD_BALANCE_NEWIDLE |
5349 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005350 SD_BALANCE_EXEC |
5351 SD_SHARE_CPUPOWER |
5352 SD_SHARE_PKG_RESOURCES)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005353 if (sd->groups != sd->groups->next)
5354 return 0;
5355 }
5356
5357 /* Following flags don't use groups */
5358 if (sd->flags & (SD_WAKE_IDLE |
5359 SD_WAKE_AFFINE |
5360 SD_WAKE_BALANCE))
5361 return 0;
5362
5363 return 1;
5364}
5365
Ingo Molnar48f24c42006-07-03 00:25:40 -07005366static int
5367sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
Suresh Siddha245af2c2005-06-25 14:57:25 -07005368{
5369 unsigned long cflags = sd->flags, pflags = parent->flags;
5370
5371 if (sd_degenerate(parent))
5372 return 1;
5373
5374 if (!cpus_equal(sd->span, parent->span))
5375 return 0;
5376
5377 /* Does parent contain flags not in child? */
5378 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
5379 if (cflags & SD_WAKE_AFFINE)
5380 pflags &= ~SD_WAKE_BALANCE;
5381 /* Flags needing groups don't count if only 1 group in parent */
5382 if (parent->groups == parent->groups->next) {
5383 pflags &= ~(SD_LOAD_BALANCE |
5384 SD_BALANCE_NEWIDLE |
5385 SD_BALANCE_FORK |
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005386 SD_BALANCE_EXEC |
5387 SD_SHARE_CPUPOWER |
5388 SD_SHARE_PKG_RESOURCES);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005389 }
5390 if (~cflags & pflags)
5391 return 0;
5392
5393 return 1;
5394}
5395
Linus Torvalds1da177e2005-04-16 15:20:36 -07005396/*
5397 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5398 * hold the hotplug lock.
5399 */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005400static void cpu_attach_domain(struct sched_domain *sd, int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005401{
Ingo Molnar70b97a72006-07-03 00:25:42 -07005402 struct rq *rq = cpu_rq(cpu);
Suresh Siddha245af2c2005-06-25 14:57:25 -07005403 struct sched_domain *tmp;
5404
5405 /* Remove the sched domains which do not contribute to scheduling. */
5406 for (tmp = sd; tmp; tmp = tmp->parent) {
5407 struct sched_domain *parent = tmp->parent;
5408 if (!parent)
5409 break;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005410 if (sd_parent_degenerate(tmp, parent)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005411 tmp->parent = parent->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005412 if (parent->parent)
5413 parent->parent->child = tmp;
5414 }
Suresh Siddha245af2c2005-06-25 14:57:25 -07005415 }
5416
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005417 if (sd && sd_degenerate(sd)) {
Suresh Siddha245af2c2005-06-25 14:57:25 -07005418 sd = sd->parent;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005419 if (sd)
5420 sd->child = NULL;
5421 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005422
5423 sched_domain_debug(sd, cpu);
5424
Nick Piggin674311d2005-06-25 14:57:27 -07005425 rcu_assign_pointer(rq->sd, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005426}
5427
5428/* cpus with isolated domains */
Tim Chen67af63a2006-12-22 01:07:50 -08005429static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005430
5431/* Setup the mask of cpus configured for isolated domains */
5432static int __init isolated_cpu_setup(char *str)
5433{
5434 int ints[NR_CPUS], i;
5435
5436 str = get_options(str, ARRAY_SIZE(ints), ints);
5437 cpus_clear(cpu_isolated_map);
5438 for (i = 1; i <= ints[0]; i++)
5439 if (ints[i] < NR_CPUS)
5440 cpu_set(ints[i], cpu_isolated_map);
5441 return 1;
5442}
5443
5444__setup ("isolcpus=", isolated_cpu_setup);
5445
5446/*
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005447 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
5448 * to a function which identifies what group (along with its sched group) a CPU
5449 * belongs to. The return value of group_fn must be >= 0 and < NR_CPUS
5450 * (due to the fact that we keep track of groups covered with a cpumask_t).
Linus Torvalds1da177e2005-04-16 15:20:36 -07005451 *
5452 * init_sched_build_groups will build a circular linked list of the groups
5453 * covered by the given span, and will set each group's ->cpumask correctly,
5454 * and ->cpu_power to 0.
5455 */
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005456static void
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005457init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5458 int (*group_fn)(int cpu, const cpumask_t *cpu_map,
5459 struct sched_group **sg))
Linus Torvalds1da177e2005-04-16 15:20:36 -07005460{
5461 struct sched_group *first = NULL, *last = NULL;
5462 cpumask_t covered = CPU_MASK_NONE;
5463 int i;
5464
5465 for_each_cpu_mask(i, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005466 struct sched_group *sg;
5467 int group = group_fn(i, cpu_map, &sg);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005468 int j;
5469
5470 if (cpu_isset(i, covered))
5471 continue;
5472
5473 sg->cpumask = CPU_MASK_NONE;
Eric Dumazet5517d862007-05-08 00:32:57 -07005474 sg->__cpu_power = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005475
5476 for_each_cpu_mask(j, span) {
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005477 if (group_fn(j, cpu_map, NULL) != group)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005478 continue;
5479
5480 cpu_set(j, covered);
5481 cpu_set(j, sg->cpumask);
5482 }
5483 if (!first)
5484 first = sg;
5485 if (last)
5486 last->next = sg;
5487 last = sg;
5488 }
5489 last->next = first;
5490}
5491
John Hawkes9c1cfda2005-09-06 15:18:14 -07005492#define SD_NODES_PER_DOMAIN 16
Linus Torvalds1da177e2005-04-16 15:20:36 -07005493
John Hawkes9c1cfda2005-09-06 15:18:14 -07005494#ifdef CONFIG_NUMA
akpm@osdl.org198e2f12006-01-12 01:05:30 -08005495
John Hawkes9c1cfda2005-09-06 15:18:14 -07005496/**
5497 * find_next_best_node - find the next node to include in a sched_domain
5498 * @node: node whose sched_domain we're building
5499 * @used_nodes: nodes already in the sched_domain
5500 *
5501 * Find the next node to include in a given scheduling domain. Simply
5502 * finds the closest node not already in the @used_nodes map.
5503 *
5504 * Should use nodemask_t.
5505 */
5506static int find_next_best_node(int node, unsigned long *used_nodes)
5507{
5508 int i, n, val, min_val, best_node = 0;
5509
5510 min_val = INT_MAX;
5511
5512 for (i = 0; i < MAX_NUMNODES; i++) {
5513 /* Start at @node */
5514 n = (node + i) % MAX_NUMNODES;
5515
5516 if (!nr_cpus_node(n))
5517 continue;
5518
5519 /* Skip already used nodes */
5520 if (test_bit(n, used_nodes))
5521 continue;
5522
5523 /* Simple min distance search */
5524 val = node_distance(node, n);
5525
5526 if (val < min_val) {
5527 min_val = val;
5528 best_node = n;
5529 }
5530 }
5531
5532 set_bit(best_node, used_nodes);
5533 return best_node;
5534}
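
/*
 * Illustrative worked example (hypothetical distances, not from this file):
 * on a 4-node box where node_distance(0, {1, 2, 3}) = {20, 10, 30}, with
 * node 0 already marked in used_nodes (as sched_domain_node_span() below
 * arranges) and every node having CPUs, successive calls return node 2,
 * then node 1, then node 3, marking each in used_nodes as it goes.
 */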
5535
5536/**
5537 * sched_domain_node_span - get a cpumask for a node's sched_domain
5538 * @node: node whose cpumask we're constructing
5539 * @size: number of nodes to include in this span
5540 *
5541 * Given a node, construct a good cpumask for its sched_domain to span. It
5542 * should be one that prevents unnecessary balancing, but also spreads tasks
5543 * out optimally.
5544 */
5545static cpumask_t sched_domain_node_span(int node)
5546{
John Hawkes9c1cfda2005-09-06 15:18:14 -07005547 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005548 cpumask_t span, nodemask;
5549 int i;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005550
5551 cpus_clear(span);
5552 bitmap_zero(used_nodes, MAX_NUMNODES);
5553
5554 nodemask = node_to_cpumask(node);
5555 cpus_or(span, span, nodemask);
5556 set_bit(node, used_nodes);
5557
5558 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5559 int next_node = find_next_best_node(node, used_nodes);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005560
John Hawkes9c1cfda2005-09-06 15:18:14 -07005561 nodemask = node_to_cpumask(next_node);
5562 cpus_or(span, span, nodemask);
5563 }
5564
5565 return span;
5566}
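
/*
 * Illustrative note (not part of the original file): the span is the union
 * of node_to_cpumask() for @node and its SD_NODES_PER_DOMAIN - 1 nearest
 * neighbours.  On a machine with fewer nodes than that, find_next_best_node()
 * eventually finds nothing unused and falls back to node 0, so the span
 * simply ends up covering every node that has CPUs.
 */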
5567#endif
5568
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005569int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005570
John Hawkes9c1cfda2005-09-06 15:18:14 -07005571/*
Ingo Molnar48f24c42006-07-03 00:25:40 -07005572 * SMT sched-domains:
John Hawkes9c1cfda2005-09-06 15:18:14 -07005573 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005574#ifdef CONFIG_SCHED_SMT
5575static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005576static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005577
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005578static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map,
5579 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005580{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005581 if (sg)
5582 *sg = &per_cpu(sched_group_cpus, cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005583 return cpu;
5584}
5585#endif
5586
Ingo Molnar48f24c42006-07-03 00:25:40 -07005587/*
5588 * multi-core sched-domains:
5589 */
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005590#ifdef CONFIG_SCHED_MC
5591static DEFINE_PER_CPU(struct sched_domain, core_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005592static DEFINE_PER_CPU(struct sched_group, sched_group_core);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005593#endif
5594
5595#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005596static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5597 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005598{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005599 int group;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005600 cpumask_t mask = cpu_sibling_map[cpu];
5601 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005602 group = first_cpu(mask);
5603 if (sg)
5604 *sg = &per_cpu(sched_group_core, group);
5605 return group;
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005606}
5607#elif defined(CONFIG_SCHED_MC)
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005608static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
5609 struct sched_group **sg)
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005610{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005611 if (sg)
5612 *sg = &per_cpu(sched_group_core, cpu);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005613 return cpu;
5614}
5615#endif
5616
Linus Torvalds1da177e2005-04-16 15:20:36 -07005617static DEFINE_PER_CPU(struct sched_domain, phys_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005618static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
Ingo Molnar48f24c42006-07-03 00:25:40 -07005619
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005620static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map,
5621 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005622{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005623 int group;
Ingo Molnar48f24c42006-07-03 00:25:40 -07005624#ifdef CONFIG_SCHED_MC
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005625 cpumask_t mask = cpu_coregroup_map(cpu);
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005626 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005627 group = first_cpu(mask);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005628#elif defined(CONFIG_SCHED_SMT)
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005629 cpumask_t mask = cpu_sibling_map[cpu];
5630 cpus_and(mask, mask, *cpu_map);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005631 group = first_cpu(mask);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005632#else
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005633 group = cpu;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005634#endif
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005635 if (sg)
5636 *sg = &per_cpu(sched_group_phys, group);
5637 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005638}
5639
5640#ifdef CONFIG_NUMA
John Hawkes9c1cfda2005-09-06 15:18:14 -07005641/*
5642 * init_sched_build_groups() can't handle what we want to do with node
5643 * groups, so roll our own. Now each node has its own list of groups which
5644 * gets dynamically allocated.
5645 */
Linus Torvalds1da177e2005-04-16 15:20:36 -07005646static DEFINE_PER_CPU(struct sched_domain, node_domains);
John Hawkesd1b55132005-09-06 15:18:14 -07005647static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
John Hawkes9c1cfda2005-09-06 15:18:14 -07005648
5649static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005650static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005651
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005652static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
5653 struct sched_group **sg)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005654{
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005655 cpumask_t nodemask = node_to_cpumask(cpu_to_node(cpu));
5656 int group;
5657
5658 cpus_and(nodemask, nodemask, *cpu_map);
5659 group = first_cpu(nodemask);
5660
5661 if (sg)
5662 *sg = &per_cpu(sched_group_allnodes, group);
5663 return group;
Linus Torvalds1da177e2005-04-16 15:20:36 -07005664}
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005665
Siddha, Suresh B08069032006-03-27 01:15:23 -08005666static void init_numa_sched_groups_power(struct sched_group *group_head)
5667{
5668 struct sched_group *sg = group_head;
5669 int j;
5670
5671 if (!sg)
5672 return;
5673next_sg:
5674 for_each_cpu_mask(j, sg->cpumask) {
5675 struct sched_domain *sd;
5676
5677 sd = &per_cpu(phys_domains, j);
5678 if (j != first_cpu(sd->groups->cpumask)) {
5679 /*
5680 * Only add "power" once for each
5681 * physical package.
5682 */
5683 continue;
5684 }
5685
Eric Dumazet5517d862007-05-08 00:32:57 -07005686 sg_inc_cpu_power(sg, sd->groups->__cpu_power);
Siddha, Suresh B08069032006-03-27 01:15:23 -08005687 }
5688 sg = sg->next;
5689 if (sg != group_head)
5690 goto next_sg;
5691}
Linus Torvalds1da177e2005-04-16 15:20:36 -07005692#endif
5693
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005694#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005695/* Free memory allocated for various sched_group structures */
5696static void free_sched_groups(const cpumask_t *cpu_map)
5697{
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005698 int cpu, i;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005699
5700 for_each_cpu_mask(cpu, *cpu_map) {
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005701 struct sched_group **sched_group_nodes
5702 = sched_group_nodes_bycpu[cpu];
5703
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005704 if (!sched_group_nodes)
5705 continue;
5706
5707 for (i = 0; i < MAX_NUMNODES; i++) {
5708 cpumask_t nodemask = node_to_cpumask(i);
5709 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5710
5711 cpus_and(nodemask, nodemask, *cpu_map);
5712 if (cpus_empty(nodemask))
5713 continue;
5714
5715 if (sg == NULL)
5716 continue;
5717 sg = sg->next;
5718next_sg:
5719 oldsg = sg;
5720 sg = sg->next;
5721 kfree(oldsg);
5722 if (oldsg != sched_group_nodes[i])
5723 goto next_sg;
5724 }
5725 kfree(sched_group_nodes);
5726 sched_group_nodes_bycpu[cpu] = NULL;
5727 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005728}
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07005729#else
5730static void free_sched_groups(const cpumask_t *cpu_map)
5731{
5732}
5733#endif
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005734
Linus Torvalds1da177e2005-04-16 15:20:36 -07005735/*
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005736 * Initialize sched groups cpu_power.
5737 *
5738 * cpu_power indicates the capacity of a sched group, which is used while
5739 * distributing the load between different sched groups in a sched domain.
5740 * Typically cpu_power for all the groups in a sched domain will be the same unless
5741 * there are asymmetries in the topology. If there are asymmetries, the group
5742 * having more cpu_power will pick up more load compared to the group having
5743 * less cpu_power.
5744 *
5745 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
5746 * the maximum number of tasks a group can handle in the presence of other idle
5747 * or lightly loaded groups in the same sched domain.
5748 */
5749static void init_sched_groups_power(int cpu, struct sched_domain *sd)
5750{
5751 struct sched_domain *child;
5752 struct sched_group *group;
5753
5754 WARN_ON(!sd || !sd->groups);
5755
5756 if (cpu != first_cpu(sd->groups->cpumask))
5757 return;
5758
5759 child = sd->child;
5760
Eric Dumazet5517d862007-05-08 00:32:57 -07005761 sd->groups->__cpu_power = 0;
5762
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005763 /*
5764 * For perf policy, if the groups in the child domain share resources
5765 * (for example cores sharing some portions of the cache hierarchy
5766 * or SMT), then set this domain's groups' cpu_power such that each group
5767 * can handle only one task, when there are other idle groups in the
5768 * same sched domain.
5769 */
5770 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
5771 (child->flags &
5772 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
Eric Dumazet5517d862007-05-08 00:32:57 -07005773 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005774 return;
5775 }
5776
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005777 /*
5778 * add cpu_power of each child group to this group's cpu_power
5779 */
5780 group = child->groups;
5781 do {
Eric Dumazet5517d862007-05-08 00:32:57 -07005782 sg_inc_cpu_power(sd->groups, group->__cpu_power);
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005783 group = group->next;
5784 } while (group != child->groups);
5785}
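
/*
 * Illustrative worked example (hypothetical topology, not from this file):
 * with power-savings balancing off, a domain whose child level shares CPU
 * power or package resources (SMT siblings, cores sharing cache) gets its
 * group capped at SCHED_LOAD_SCALE, i.e. sized for a single task while
 * other groups in the same domain sit idle.  Otherwise the group power is
 * the sum over the child groups, e.g. two child groups of SCHED_LOAD_SCALE
 * each yield 2 * SCHED_LOAD_SCALE here.
 */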
5786
5787/*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005788 * Build sched domains for a given set of cpus and attach the sched domains
5789 * to the individual cpus
Linus Torvalds1da177e2005-04-16 15:20:36 -07005790 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005791static int build_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07005792{
5793 int i;
John Hawkesd1b55132005-09-06 15:18:14 -07005794#ifdef CONFIG_NUMA
5795 struct sched_group **sched_group_nodes = NULL;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005796 int sd_allnodes = 0;
John Hawkesd1b55132005-09-06 15:18:14 -07005797
5798 /*
5799 * Allocate the per-node list of sched groups
5800 */
Ingo Molnardd41f592007-07-09 18:51:59 +02005801 sched_group_nodes = kzalloc(sizeof(struct sched_group *)*MAX_NUMNODES,
Srivatsa Vaddagirid3a5aa92006-06-27 02:54:39 -07005802 GFP_KERNEL);
John Hawkesd1b55132005-09-06 15:18:14 -07005803 if (!sched_group_nodes) {
5804 printk(KERN_WARNING "Can not alloc sched group node list\n");
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005805 return -ENOMEM;
John Hawkesd1b55132005-09-06 15:18:14 -07005806 }
5807 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5808#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005809
5810 /*
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005811 * Set up domains for cpus specified by the cpu_map.
Linus Torvalds1da177e2005-04-16 15:20:36 -07005812 */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005813 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005814 struct sched_domain *sd = NULL, *p;
5815 cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
5816
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005817 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005818
5819#ifdef CONFIG_NUMA
Ingo Molnardd41f592007-07-09 18:51:59 +02005820 if (cpus_weight(*cpu_map) >
5821 SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
John Hawkes9c1cfda2005-09-06 15:18:14 -07005822 sd = &per_cpu(allnodes_domains, i);
5823 *sd = SD_ALLNODES_INIT;
5824 sd->span = *cpu_map;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005825 cpu_to_allnodes_group(i, cpu_map, &sd->groups);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005826 p = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005827 sd_allnodes = 1;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005828 } else
5829 p = NULL;
5830
Linus Torvalds1da177e2005-04-16 15:20:36 -07005831 sd = &per_cpu(node_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005832 *sd = SD_NODE_INIT;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005833 sd->span = sched_domain_node_span(cpu_to_node(i));
5834 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005835 if (p)
5836 p->child = sd;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005837 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005838#endif
5839
5840 p = sd;
5841 sd = &per_cpu(phys_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005842 *sd = SD_CPU_INIT;
5843 sd->span = nodemask;
5844 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005845 if (p)
5846 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005847 cpu_to_phys_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005848
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005849#ifdef CONFIG_SCHED_MC
5850 p = sd;
5851 sd = &per_cpu(core_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005852 *sd = SD_MC_INIT;
5853 sd->span = cpu_coregroup_map(i);
5854 cpus_and(sd->span, sd->span, *cpu_map);
5855 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005856 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005857 cpu_to_core_group(i, cpu_map, &sd->groups);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005858#endif
5859
Linus Torvalds1da177e2005-04-16 15:20:36 -07005860#ifdef CONFIG_SCHED_SMT
5861 p = sd;
5862 sd = &per_cpu(cpu_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005863 *sd = SD_SIBLING_INIT;
5864 sd->span = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005865 cpus_and(sd->span, sd->span, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005866 sd->parent = p;
Siddha, Suresh B1a848872006-10-03 01:14:08 -07005867 p->child = sd;
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005868 cpu_to_cpu_group(i, cpu_map, &sd->groups);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005869#endif
5870 }
5871
5872#ifdef CONFIG_SCHED_SMT
5873 /* Set up CPU (sibling) groups */
John Hawkes9c1cfda2005-09-06 15:18:14 -07005874 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07005875 cpumask_t this_sibling_map = cpu_sibling_map[i];
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005876 cpus_and(this_sibling_map, this_sibling_map, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005877 if (i != first_cpu(this_sibling_map))
5878 continue;
5879
Ingo Molnardd41f592007-07-09 18:51:59 +02005880 init_sched_build_groups(this_sibling_map, cpu_map,
5881 &cpu_to_cpu_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005882 }
5883#endif
5884
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005885#ifdef CONFIG_SCHED_MC
5886 /* Set up multi-core groups */
5887 for_each_cpu_mask(i, *cpu_map) {
5888 cpumask_t this_core_map = cpu_coregroup_map(i);
5889 cpus_and(this_core_map, this_core_map, *cpu_map);
5890 if (i != first_cpu(this_core_map))
5891 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02005892 init_sched_build_groups(this_core_map, cpu_map,
5893 &cpu_to_core_group);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08005894 }
5895#endif
5896
Linus Torvalds1da177e2005-04-16 15:20:36 -07005897 /* Set up physical groups */
5898 for (i = 0; i < MAX_NUMNODES; i++) {
5899 cpumask_t nodemask = node_to_cpumask(i);
5900
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005901 cpus_and(nodemask, nodemask, *cpu_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005902 if (cpus_empty(nodemask))
5903 continue;
5904
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005905 init_sched_build_groups(nodemask, cpu_map, &cpu_to_phys_group);
Linus Torvalds1da177e2005-04-16 15:20:36 -07005906 }
5907
5908#ifdef CONFIG_NUMA
5909 /* Set up node groups */
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08005910 if (sd_allnodes)
Ingo Molnardd41f592007-07-09 18:51:59 +02005911 init_sched_build_groups(*cpu_map, cpu_map,
5912 &cpu_to_allnodes_group);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005913
5914 for (i = 0; i < MAX_NUMNODES; i++) {
5915 /* Set up node groups */
5916 struct sched_group *sg, *prev;
5917 cpumask_t nodemask = node_to_cpumask(i);
5918 cpumask_t domainspan;
5919 cpumask_t covered = CPU_MASK_NONE;
5920 int j;
5921
5922 cpus_and(nodemask, nodemask, *cpu_map);
John Hawkesd1b55132005-09-06 15:18:14 -07005923 if (cpus_empty(nodemask)) {
5924 sched_group_nodes[i] = NULL;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005925 continue;
John Hawkesd1b55132005-09-06 15:18:14 -07005926 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005927
5928 domainspan = sched_domain_node_span(i);
5929 cpus_and(domainspan, domainspan, *cpu_map);
5930
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07005931 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005932 if (!sg) {
5933 printk(KERN_WARNING "Can not alloc domain group for "
5934 "node %d\n", i);
5935 goto error;
5936 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005937 sched_group_nodes[i] = sg;
5938 for_each_cpu_mask(j, nodemask) {
5939 struct sched_domain *sd;
Ingo Molnar9761eea2007-07-09 18:52:00 +02005940
John Hawkes9c1cfda2005-09-06 15:18:14 -07005941 sd = &per_cpu(node_domains, j);
5942 sd->groups = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005943 }
Eric Dumazet5517d862007-05-08 00:32:57 -07005944 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005945 sg->cpumask = nodemask;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005946 sg->next = sg;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005947 cpus_or(covered, covered, nodemask);
5948 prev = sg;
5949
5950 for (j = 0; j < MAX_NUMNODES; j++) {
5951 cpumask_t tmp, notcovered;
5952 int n = (i + j) % MAX_NUMNODES;
5953
5954 cpus_complement(notcovered, covered);
5955 cpus_and(tmp, notcovered, *cpu_map);
5956 cpus_and(tmp, tmp, domainspan);
5957 if (cpus_empty(tmp))
5958 break;
5959
5960 nodemask = node_to_cpumask(n);
5961 cpus_and(tmp, tmp, nodemask);
5962 if (cpus_empty(tmp))
5963 continue;
5964
Srivatsa Vaddagiri15f0b672006-06-27 02:54:40 -07005965 sg = kmalloc_node(sizeof(struct sched_group),
5966 GFP_KERNEL, i);
John Hawkes9c1cfda2005-09-06 15:18:14 -07005967 if (!sg) {
5968 printk(KERN_WARNING
5969 "Can not alloc domain group for node %d\n", j);
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005970 goto error;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005971 }
Eric Dumazet5517d862007-05-08 00:32:57 -07005972 sg->__cpu_power = 0;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005973 sg->cpumask = tmp;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07005974 sg->next = prev->next;
John Hawkes9c1cfda2005-09-06 15:18:14 -07005975 cpus_or(covered, covered, tmp);
5976 prev->next = sg;
5977 prev = sg;
5978 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07005979 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07005980#endif
5981
5982 /* Calculate CPU power for physical packages and nodes */
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005983#ifdef CONFIG_SCHED_SMT
5984 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005985 struct sched_domain *sd = &per_cpu(cpu_domains, i);
5986
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005987 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005988 }
5989#endif
5990#ifdef CONFIG_SCHED_MC
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07005991 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005992 struct sched_domain *sd = &per_cpu(core_domains, i);
5993
Siddha, Suresh B89c47102006-10-03 01:14:09 -07005994 init_sched_groups_power(i, sd);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005995 }
5996#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07005997
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07005998 for_each_cpu_mask(i, *cpu_map) {
Ingo Molnardd41f592007-07-09 18:51:59 +02005999 struct sched_domain *sd = &per_cpu(phys_domains, i);
6000
Siddha, Suresh B89c47102006-10-03 01:14:09 -07006001 init_sched_groups_power(i, sd);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006002 }
6003
John Hawkes9c1cfda2005-09-06 15:18:14 -07006004#ifdef CONFIG_NUMA
Siddha, Suresh B08069032006-03-27 01:15:23 -08006005 for (i = 0; i < MAX_NUMNODES; i++)
6006 init_numa_sched_groups_power(sched_group_nodes[i]);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006007
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006008 if (sd_allnodes) {
6009 struct sched_group *sg;
Siddha, Suresh Bf712c0c72006-07-30 03:02:59 -07006010
Siddha, Suresh B6711cab2006-12-10 02:20:07 -08006011 cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg);
Siddha, Suresh Bf712c0c72006-07-30 03:02:59 -07006012 init_numa_sched_groups_power(sg);
6013 }
John Hawkes9c1cfda2005-09-06 15:18:14 -07006014#endif
6015
Linus Torvalds1da177e2005-04-16 15:20:36 -07006016 /* Attach the domains */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006017 for_each_cpu_mask(i, *cpu_map) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07006018 struct sched_domain *sd;
6019#ifdef CONFIG_SCHED_SMT
6020 sd = &per_cpu(cpu_domains, i);
Siddha, Suresh B1e9f28f2006-03-27 01:15:22 -08006021#elif defined(CONFIG_SCHED_MC)
6022 sd = &per_cpu(core_domains, i);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006023#else
6024 sd = &per_cpu(phys_domains, i);
6025#endif
6026 cpu_attach_domain(sd, i);
6027 }
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006028
6029 return 0;
6030
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006031#ifdef CONFIG_NUMA
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006032error:
6033 free_sched_groups(cpu_map);
6034 return -ENOMEM;
Siddha, Suresh Ba6160582006-10-03 01:14:06 -07006035#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006036}
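
/*
 * Illustrative summary (not part of the original file): on a hypothetical
 * non-NUMA box with CONFIG_SCHED_SMT and CONFIG_SCHED_MC, the loops above
 * leave each CPU i with the per-CPU chain
 *
 *	cpu_domains(i) -> core_domains(i) -> phys_domains(i)
 *
 * linked via ->parent/->child, and cpu_attach_domain() attaches the lowest
 * (SMT) level to the runqueue.  NUMA configs stack node_domains and,
 * optionally, allnodes_domains on top.
 */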
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006037/*
6038 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6039 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006040static int arch_init_sched_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006041{
6042 cpumask_t cpu_default_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006043 int err;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006044
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006045 /*
6046 * Set up the mask of cpus without special case scheduling requirements.
6047 * For now this just excludes isolated cpus, but could be used to
6048 * exclude other special cases in the future.
6049 */
6050 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
6051
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006052 err = build_sched_domains(&cpu_default_map);
6053
6054 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006055}
6056
6057static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
Linus Torvalds1da177e2005-04-16 15:20:36 -07006058{
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006059 free_sched_groups(cpu_map);
John Hawkes9c1cfda2005-09-06 15:18:14 -07006060}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006061
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006062/*
6063 * Detach sched domains from a group of cpus specified in cpu_map
6064 * These cpus will now be attached to the NULL domain
6065 */
Arjan van de Ven858119e2006-01-14 13:20:43 -08006066static void detach_destroy_domains(const cpumask_t *cpu_map)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006067{
6068 int i;
6069
6070 for_each_cpu_mask(i, *cpu_map)
6071 cpu_attach_domain(NULL, i);
6072 synchronize_sched();
6073 arch_destroy_sched_domains(cpu_map);
6074}
6075
6076/*
6077 * Partition sched domains as specified by the cpumasks below.
6078 * This attaches all cpus from the cpumasks to the NULL domain,
6079 * waits for an RCU quiescent period, recalculates sched
6080 * domain information and then attaches them back to the
6081 * correct sched domains.
6082 * Call with hotplug lock held.
6083 */
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006084int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006085{
6086 cpumask_t change_map;
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006087 int err = 0;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006088
6089 cpus_and(*partition1, *partition1, cpu_online_map);
6090 cpus_and(*partition2, *partition2, cpu_online_map);
6091 cpus_or(change_map, *partition1, *partition2);
6092
6093 /* Detach sched domains from all of the affected cpus */
6094 detach_destroy_domains(&change_map);
6095 if (!cpus_empty(*partition1))
Srivatsa Vaddagiri51888ca2006-06-27 02:54:38 -07006096 err = build_sched_domains(partition1);
6097 if (!err && !cpus_empty(*partition2))
6098 err = build_sched_domains(partition2);
6099
6100 return err;
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006101}
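
/*
 * Illustrative sketch (assumed caller such as cpuset code, not part of this
 * file): split two CPUs into separate partitions and rebuild their domains.
 * Must run with the hotplug lock held, per the comment above; the function
 * and helpers used here all appear earlier in this file.
 */
#if 0
static void example_repartition(void)
{
	cpumask_t a = CPU_MASK_NONE, b = CPU_MASK_NONE;

	cpu_set(0, a);
	cpu_set(1, b);
	partition_sched_domains(&a, &b);
}
#endif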
6102
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6104int arch_reinit_sched_domains(void)
6105{
6106 int err;
6107
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006108 mutex_lock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006109 detach_destroy_domains(&cpu_online_map);
6110 err = arch_init_sched_domains(&cpu_online_map);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006111 mutex_unlock(&sched_hotcpu_mutex);
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006112
6113 return err;
6114}
6115
6116static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6117{
6118 int ret;
6119
6120 if (buf[0] != '0' && buf[0] != '1')
6121 return -EINVAL;
6122
6123 if (smt)
6124 sched_smt_power_savings = (buf[0] == '1');
6125 else
6126 sched_mc_power_savings = (buf[0] == '1');
6127
6128 ret = arch_reinit_sched_domains();
6129
6130 return ret ? ret : count;
6131}
6132
6133int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6134{
6135 int err = 0;
Ingo Molnar48f24c42006-07-03 00:25:40 -07006136
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006137#ifdef CONFIG_SCHED_SMT
6138 if (smt_capable())
6139 err = sysfs_create_file(&cls->kset.kobj,
6140 &attr_sched_smt_power_savings.attr);
6141#endif
6142#ifdef CONFIG_SCHED_MC
6143 if (!err && mc_capable())
6144 err = sysfs_create_file(&cls->kset.kobj,
6145 &attr_sched_mc_power_savings.attr);
6146#endif
6147 return err;
6148}
6149#endif
6150
6151#ifdef CONFIG_SCHED_MC
6152static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6153{
6154 return sprintf(page, "%u\n", sched_mc_power_savings);
6155}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006156static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6157 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006158{
6159 return sched_power_savings_store(buf, count, 0);
6160}
6161SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6162 sched_mc_power_savings_store);
6163#endif
6164
6165#ifdef CONFIG_SCHED_SMT
6166static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6167{
6168 return sprintf(page, "%u\n", sched_smt_power_savings);
6169}
Ingo Molnar48f24c42006-07-03 00:25:40 -07006170static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6171 const char *buf, size_t count)
Siddha, Suresh B5c45bf22006-06-27 02:54:42 -07006172{
6173 return sched_power_savings_store(buf, count, 1);
6174}
6175SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6176 sched_smt_power_savings_store);
6177#endif
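
/*
 * Illustrative usage note (not from this file): with the attributes above
 * registered on the cpu sysdev class, user space typically toggles the
 * policy with something like
 *
 *	echo 1 > /sys/devices/system/cpu/sched_mc_power_savings
 *
 * (the exact path depends on the sysdev class the architecture registers);
 * sched_power_savings_store() accepts only '0' or '1' and then calls
 * arch_reinit_sched_domains() to rebuild the domain hierarchy.
 */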
6178
Linus Torvalds1da177e2005-04-16 15:20:36 -07006179/*
6180 * Force a reinitialization of the sched domains hierarchy. The domains
6181 * and groups cannot be updated in place without racing with the balancing
Nick Piggin41c7ce92005-06-25 14:57:24 -07006182 * code, so we temporarily attach all running cpus to the NULL domain
Linus Torvalds1da177e2005-04-16 15:20:36 -07006183 * which will prevent rebalancing while the sched domains are recalculated.
6184 */
6185static int update_sched_domains(struct notifier_block *nfb,
6186 unsigned long action, void *hcpu)
6187{
Linus Torvalds1da177e2005-04-16 15:20:36 -07006188 switch (action) {
6189 case CPU_UP_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006190 case CPU_UP_PREPARE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006191 case CPU_DOWN_PREPARE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006192 case CPU_DOWN_PREPARE_FROZEN:
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006193 detach_destroy_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006194 return NOTIFY_OK;
6195
6196 case CPU_UP_CANCELED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006197 case CPU_UP_CANCELED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006198 case CPU_DOWN_FAILED:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006199 case CPU_DOWN_FAILED_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006200 case CPU_ONLINE:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006201 case CPU_ONLINE_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006202 case CPU_DEAD:
Rafael J. Wysocki8bb78442007-05-09 02:35:10 -07006203 case CPU_DEAD_FROZEN:
Linus Torvalds1da177e2005-04-16 15:20:36 -07006204 /*
6205 * Fall through and re-initialise the domains.
6206 */
6207 break;
6208 default:
6209 return NOTIFY_DONE;
6210 }
6211
6212 /* The hotplug lock is already held by cpu_up/cpu_down */
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006213 arch_init_sched_domains(&cpu_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006214
6215 return NOTIFY_OK;
6216}
Linus Torvalds1da177e2005-04-16 15:20:36 -07006217
6218void __init sched_init_smp(void)
6219{
Nick Piggin5c1e1762006-10-03 01:14:04 -07006220 cpumask_t non_isolated_cpus;
6221
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006222 mutex_lock(&sched_hotcpu_mutex);
Dinakar Guniguntala1a20ff22005-06-25 14:57:33 -07006223 arch_init_sched_domains(&cpu_online_map);
Nathan Lynche5e56732007-01-10 23:15:28 -08006224 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006225 if (cpus_empty(non_isolated_cpus))
6226 cpu_set(smp_processor_id(), non_isolated_cpus);
Gautham R Shenoy5be93612007-05-09 02:34:04 -07006227 mutex_unlock(&sched_hotcpu_mutex);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006228 /* XXX: Theoretical race here - CPU may be hotplugged now */
6229 hotcpu_notifier(update_sched_domains, 0);
Nick Piggin5c1e1762006-10-03 01:14:04 -07006230
6231 /* Move init over to a non-isolated CPU */
6232 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6233 BUG();
Ingo Molnardd41f592007-07-09 18:51:59 +02006234 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006235}
6236#else
6237void __init sched_init_smp(void)
6238{
Ingo Molnardd41f592007-07-09 18:51:59 +02006239 sched_init_granularity();
Linus Torvalds1da177e2005-04-16 15:20:36 -07006240}
6241#endif /* CONFIG_SMP */
6242
6243int in_sched_functions(unsigned long addr)
6244{
6245 /* Linker adds these: start and end of __sched functions */
6246 extern char __sched_text_start[], __sched_text_end[];
Ingo Molnar48f24c42006-07-03 00:25:40 -07006247
Linus Torvalds1da177e2005-04-16 15:20:36 -07006248 return in_lock_functions(addr) ||
6249 (addr >= (unsigned long)__sched_text_start
6250 && addr < (unsigned long)__sched_text_end);
6251}
6252
Ingo Molnardd41f592007-07-09 18:51:59 +02006253static inline void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6254{
6255 cfs_rq->tasks_timeline = RB_ROOT;
6256 cfs_rq->fair_clock = 1;
6257#ifdef CONFIG_FAIR_GROUP_SCHED
6258 cfs_rq->rq = rq;
6259#endif
6260}
6261
Linus Torvalds1da177e2005-04-16 15:20:36 -07006262void __init sched_init(void)
6263{
Ingo Molnardd41f592007-07-09 18:51:59 +02006264 u64 now = sched_clock();
Christoph Lameter476f3532007-05-06 14:48:58 -07006265 int highest_cpu = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006266 int i, j;
6267
6268 /*
6269 * Link up the scheduling class hierarchy:
6270 */
6271 rt_sched_class.next = &fair_sched_class;
6272 fair_sched_class.next = &idle_sched_class;
6273 idle_sched_class.next = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006274
KAMEZAWA Hiroyuki0a945022006-03-28 01:56:37 -08006275 for_each_possible_cpu(i) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006276 struct rt_prio_array *array;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006277 struct rq *rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006278
6279 rq = cpu_rq(i);
6280 spin_lock_init(&rq->lock);
Ingo Molnarfcb99372006-07-03 00:25:10 -07006281 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
Nick Piggin78979862005-06-25 14:57:13 -07006282 rq->nr_running = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006283 rq->clock = 1;
6284 init_cfs_rq(&rq->cfs, rq);
6285#ifdef CONFIG_FAIR_GROUP_SCHED
6286 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6287 list_add(&rq->cfs.leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
6288#endif
6289 rq->ls.load_update_last = now;
6290 rq->ls.load_update_start = now;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006291
Ingo Molnardd41f592007-07-09 18:51:59 +02006292 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6293 rq->cpu_load[j] = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006294#ifdef CONFIG_SMP
Nick Piggin41c7ce92005-06-25 14:57:24 -07006295 rq->sd = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006296 rq->active_balance = 0;
Ingo Molnardd41f592007-07-09 18:51:59 +02006297 rq->next_balance = jiffies;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006298 rq->push_cpu = 0;
Christoph Lameter0a2966b2006-09-25 23:30:51 -07006299 rq->cpu = i;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006300 rq->migration_thread = NULL;
6301 INIT_LIST_HEAD(&rq->migration_queue);
6302#endif
6303 atomic_set(&rq->nr_iowait, 0);
6304
Ingo Molnardd41f592007-07-09 18:51:59 +02006305 array = &rq->rt.active;
6306 for (j = 0; j < MAX_RT_PRIO; j++) {
6307 INIT_LIST_HEAD(array->queue + j);
6308 __clear_bit(j, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006309 }
Christoph Lameter476f3532007-05-06 14:48:58 -07006310 highest_cpu = i;
Ingo Molnardd41f592007-07-09 18:51:59 +02006311 /* delimiter for bitsearch: */
6312 __set_bit(MAX_RT_PRIO, array->bitmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006313 }
6314
Peter Williams2dd73a42006-06-27 02:54:34 -07006315 set_load_weight(&init_task);
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006316
Christoph Lameterc9819f42006-12-10 02:20:25 -08006317#ifdef CONFIG_SMP
Christoph Lameter476f3532007-05-06 14:48:58 -07006318 nr_cpu_ids = highest_cpu + 1;
Christoph Lameterc9819f42006-12-10 02:20:25 -08006319 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL);
6320#endif
6321
Heiko Carstensb50f60c2006-07-30 03:03:52 -07006322#ifdef CONFIG_RT_MUTEXES
6323 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6324#endif
6325
Linus Torvalds1da177e2005-04-16 15:20:36 -07006326 /*
6327 * The boot idle thread does lazy MMU switching as well:
6328 */
6329 atomic_inc(&init_mm.mm_count);
6330 enter_lazy_tlb(&init_mm, current);
6331
6332 /*
6333 * Make us the idle thread. Technically, schedule() should not be
6334 * called from this thread, however somewhere below it might be,
6335 * but because we are the idle thread, we just pick up running again
6336 * when this runqueue becomes "idle".
6337 */
6338 init_idle(current, smp_processor_id());
Ingo Molnardd41f592007-07-09 18:51:59 +02006339 /*
6340 * During early bootup we pretend to be a normal task:
6341 */
6342 current->sched_class = &fair_sched_class;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006343}
6344
6345#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6346void __might_sleep(char *file, int line)
6347{
Ingo Molnar48f24c42006-07-03 00:25:40 -07006348#ifdef in_atomic
Linus Torvalds1da177e2005-04-16 15:20:36 -07006349 static unsigned long prev_jiffy; /* ratelimiting */
6350
6351 if ((in_atomic() || irqs_disabled()) &&
6352 system_state == SYSTEM_RUNNING && !oops_in_progress) {
6353 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6354 return;
6355 prev_jiffy = jiffies;
Ingo Molnar91368d72006-03-23 03:00:54 -08006356 printk(KERN_ERR "BUG: sleeping function called from invalid"
Linus Torvalds1da177e2005-04-16 15:20:36 -07006357 " context at %s:%d\n", file, line);
6358 printk("in_atomic():%d, irqs_disabled():%d\n",
6359 in_atomic(), irqs_disabled());
Peter Zijlstraa4c410f2006-12-06 20:37:21 -08006360 debug_show_held_locks(current);
Ingo Molnar3117df02006-12-13 00:34:43 -08006361 if (irqs_disabled())
6362 print_irqtrace_events(current);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006363 dump_stack();
6364 }
6365#endif
6366}
6367EXPORT_SYMBOL(__might_sleep);
6368#endif
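
/*
 * Illustrative sketch (assumed driver-style code, not part of this file):
 * calling a function that begins with might_sleep(), mutex_lock() here,
 * while holding a spinlock trips __might_sleep() above when
 * CONFIG_DEBUG_SPINLOCK_SLEEP is enabled.
 */
#if 0
static DEFINE_MUTEX(example_mutex);

static void example_bad_sleep(spinlock_t *lock)
{
	spin_lock(lock);
	mutex_lock(&example_mutex);	/* may sleep: triggers the BUG message */
	mutex_unlock(&example_mutex);
	spin_unlock(lock);
}
#endif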
6369
6370#ifdef CONFIG_MAGIC_SYSRQ
6371void normalize_rt_tasks(void)
6372{
Ingo Molnara0f98a12007-06-17 18:37:45 +02006373 struct task_struct *g, *p;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006374 unsigned long flags;
Ingo Molnar70b97a72006-07-03 00:25:42 -07006375 struct rq *rq;
Ingo Molnardd41f592007-07-09 18:51:59 +02006376 int on_rq;
Linus Torvalds1da177e2005-04-16 15:20:36 -07006377
6378 read_lock_irq(&tasklist_lock);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006379 do_each_thread(g, p) {
Ingo Molnardd41f592007-07-09 18:51:59 +02006380 p->se.fair_key = 0;
6381 p->se.wait_runtime = 0;
6382 p->se.wait_start_fair = 0;
6383 p->se.wait_start = 0;
6384 p->se.exec_start = 0;
6385 p->se.sleep_start = 0;
6386 p->se.sleep_start_fair = 0;
6387 p->se.block_start = 0;
6388 task_rq(p)->cfs.fair_clock = 0;
6389 task_rq(p)->clock = 0;
6390
6391 if (!rt_task(p)) {
6392 /*
6393 * Renice negative nice level userspace
6394 * tasks back to 0:
6395 */
6396 if (TASK_NICE(p) < 0 && p->mm)
6397 set_user_nice(p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006398 continue;
Ingo Molnardd41f592007-07-09 18:51:59 +02006399 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07006400
Ingo Molnarb29739f2006-06-27 02:54:51 -07006401 spin_lock_irqsave(&p->pi_lock, flags);
6402 rq = __task_rq_lock(p);
Ingo Molnardd41f592007-07-09 18:51:59 +02006403#ifdef CONFIG_SMP
6404 /*
6405 * Do not touch the migration thread:
6406 */
6407 if (p == rq->migration_thread)
6408 goto out_unlock;
6409#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -07006410
Ingo Molnardd41f592007-07-09 18:51:59 +02006411 on_rq = p->se.on_rq;
6412 if (on_rq)
6413 deactivate_task(task_rq(p), p, 0);
6414 __setscheduler(rq, p, SCHED_NORMAL, 0);
6415 if (on_rq) {
6416 activate_task(task_rq(p), p, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07006417 resched_task(rq->curr);
6418 }
Ingo Molnardd41f592007-07-09 18:51:59 +02006419#ifdef CONFIG_SMP
6420 out_unlock:
6421#endif
Ingo Molnarb29739f2006-06-27 02:54:51 -07006422 __task_rq_unlock(rq);
6423 spin_unlock_irqrestore(&p->pi_lock, flags);
Ingo Molnara0f98a12007-06-17 18:37:45 +02006424 } while_each_thread(g, p);
6425
Linus Torvalds1da177e2005-04-16 15:20:36 -07006426 read_unlock_irq(&tasklist_lock);
6427}
6428
6429#endif /* CONFIG_MAGIC_SYSRQ */
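
/*
 * Illustrative usage note (not from this file): normalize_rt_tasks() is
 * normally reached via the magic SysRq 'n' key (for instance
 * "echo n > /proc/sysrq-trigger", assuming the sysrq driver wires it up as
 * in mainline), demoting all user-space RT tasks back to SCHED_NORMAL and
 * renicing negative-nice tasks to 0 as above.
 */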
Linus Torvalds1df5c102005-09-12 07:59:21 -07006430
6431#ifdef CONFIG_IA64
6432/*
6433 * These functions are only useful for the IA64 MCA handling.
6434 *
6435 * They can only be called when the whole system has been
6436 * stopped - every CPU needs to be quiescent, and no scheduling
6437 * activity can take place. Using them for anything else would
6438 * be a serious bug, and as a result, they aren't even visible
6439 * under any other configuration.
6440 */
6441
6442/**
6443 * curr_task - return the current task for a given cpu.
6444 * @cpu: the processor in question.
6445 *
6446 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6447 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006448struct task_struct *curr_task(int cpu)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006449{
6450 return cpu_curr(cpu);
6451}
6452
6453/**
6454 * set_curr_task - set the current task for a given cpu.
6455 * @cpu: the processor in question.
6456 * @p: the task pointer to set.
6457 *
6458 * Description: This function must only be used when non-maskable interrupts
6459 * are serviced on a separate stack. It allows the architecture to switch the
6460 * notion of the current task on a cpu in a non-blocking manner. This function
6461 * must be called with all CPUs synchronized, and interrupts disabled, and
6462 * the caller must save the original value of the current task (see
6463 * curr_task() above) and restore that value before reenabling interrupts and
6464 * re-starting the system.
6465 *
6466 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6467 */
Ingo Molnar36c8b582006-07-03 00:25:41 -07006468void set_curr_task(int cpu, struct task_struct *p)
Linus Torvalds1df5c102005-09-12 07:59:21 -07006469{
6470 cpu_curr(cpu) = p;
6471}
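
/*
 * Illustrative sketch (assumed MCA-style caller, not part of this file):
 * the save/restore pattern described in the comments above.  Only valid
 * while the whole system is stopped.
 */
#if 0
static void example_switch_cpu_task(int cpu, struct task_struct *special)
{
	struct task_struct *orig = curr_task(cpu);

	set_curr_task(cpu, special);
	/* ... run the special handler on that cpu ... */
	set_curr_task(cpu, orig);
}
#endif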
6472
6473#endif