blob: 5e95b257ee26208612b8f28229612e8ce6796660 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * linux/arch/x86-64/kernel/process.c
3 *
4 * Copyright (C) 1995 Linus Torvalds
5 *
6 * Pentium III FXSR, SSE support
7 * Gareth Hughes <gareth@valinux.com>, May 2000
8 *
9 * X86-64 port
10 * Andi Kleen.
Ashok Raj76e4f662005-06-25 14:55:00 -070011 *
12 * CPU hotplug support - ashok.raj@intel.com
Linus Torvalds1da177e2005-04-16 15:20:36 -070013 */
14
15/*
16 * This file handles the architecture-dependent parts of process handling..
17 */
18
19#include <stdarg.h>
20
Ashok Raj76e4f662005-06-25 14:55:00 -070021#include <linux/cpu.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
23#include <linux/sched.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/elfcore.h>
27#include <linux/smp.h>
28#include <linux/slab.h>
29#include <linux/user.h>
30#include <linux/module.h>
31#include <linux/a.out.h>
32#include <linux/interrupt.h>
33#include <linux/delay.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/ptrace.h>
35#include <linux/utsname.h>
36#include <linux/random.h>
Andi Kleen95833c82006-01-11 22:44:36 +010037#include <linux/notifier.h>
bibo maoc6fd91f2006-03-26 01:38:20 -080038#include <linux/kprobes.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039
40#include <asm/uaccess.h>
41#include <asm/pgtable.h>
42#include <asm/system.h>
43#include <asm/io.h>
44#include <asm/processor.h>
45#include <asm/i387.h>
46#include <asm/mmu_context.h>
47#include <asm/pda.h>
48#include <asm/prctl.h>
49#include <asm/kdebug.h>
50#include <asm/desc.h>
51#include <asm/proto.h>
52#include <asm/ia32.h>
Andi Kleen95833c82006-01-11 22:44:36 +010053#include <asm/idle.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070054
asmlinkage extern void ret_from_fork(void);

/* Default clone flags used for kernel threads. */
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

/* Non-zero once "idle=" was given on the command line (see idle_setup()). */
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
/* Per-CPU flag used by cpu_idle_wait() to detect that a CPU left idle. */
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

/* Notifier chain invoked on idle entry (IDLE_START) and exit (IDLE_END). */
static ATOMIC_NOTIFIER_HEAD(idle_notifier);
Andi Kleen95833c82006-01-11 22:44:36 +010070
71void idle_notifier_register(struct notifier_block *n)
72{
Alan Sterne041c682006-03-27 01:16:30 -080073 atomic_notifier_chain_register(&idle_notifier, n);
Andi Kleen95833c82006-01-11 22:44:36 +010074}
75EXPORT_SYMBOL_GPL(idle_notifier_register);
76
/* Remove a callback from the idle entry/exit notifier chain. */
void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);
82
/*
 * Mark this CPU idle (per-CPU PDA flag) and notify the idle chain.
 * Called from cpu_idle() just before the low level idle routine runs.
 */
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}
88
/*
 * Leave idle state: clear the per-CPU idle flag and fire IDLE_END.
 * No-op when the CPU was not marked idle, which makes the call safe
 * both from the interrupt path and from the idle loop itself.
 */
static void __exit_idle(void)
{
	if (read_pda(isidle) == 0)
		return;
	write_pda(isidle, 0);
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
96
97/* Called from interrupts to signify idle end */
98void exit_idle(void)
99{
Andi Kleena15da492006-09-26 10:52:40 +0200100 /* idle loop has pid 0 */
101 if (current->pid)
Andi Kleen95833c82006-01-11 22:44:36 +0100102 return;
103 __exit_idle();
104}
105
/*
 * We use this if we don't have any better
 * idle routine..
 *
 * Halts the CPU until the next interrupt.  The irq-disable before the
 * final need_resched() check plus safe_halt() (sti;hlt, which enables
 * interrupts atomically with the halt) closes the wakeup race.
 */
static void default_idle(void)
{
	local_irq_enable();

	/* Stop polling so other CPUs send a reschedule IPI to wake us. */
	current_thread_info()->status &= ~TS_POLLING;
	smp_mb__after_clear_bit();	/* flag clear visible before testing */
	while (!need_resched()) {
		local_irq_disable();
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
	current_thread_info()->status |= TS_POLLING;
}
125
/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	local_irq_enable();

	/*
	 * Busy-wait until TIF_NEED_RESCHED appears in the thread flags;
	 * "rep; nop" is the pause hint that relaxes the CPU in the loop.
	 */
	asm volatile(
		"2:"
		"testl %0,%1;"
		"rep; nop;"
		"je 2b;"
		: :
		"i" (_TIF_NEED_RESCHED),
		"m" (current_thread_info()->flags));
}
144
/*
 * Wait until every online CPU has left its current idle routine at
 * least once.  Used after changing pm_idle so no CPU keeps executing
 * a stale idle function.
 */
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	/* Pin ourselves so this_cpu stays meaningful after put_cpu(). */
	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	/* This CPU is clearly not sitting in its idle routine right now. */
	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			/* The idle loop clears the flag when it iterates. */
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		/* Drop CPUs that went offline while we were waiting. */
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
173
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();	/* flush caches before this CPU goes away */
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	/* Halt forever with interrupts off; only a reset revives us. */
	local_irq_disable();
	while (1)
		halt();
}
#else
/* Without CPU hotplug an online CPU must never be asked to die. */
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */
197
/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	/* Poll need_resched by default; idle routines may clear this. */
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			/* Acknowledge a pending cpu_idle_wait() poke. */
			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();	/* read pm_idle after cpu_idle_state */
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}
234
/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
	local_irq_enable();

	while (!need_resched()) {
		/* Arm the monitor on the cacheline holding thread flags. */
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		/* Re-check after arming to avoid sleeping past a wakeup. */
		if (need_resched())
			break;
		__mwait(0, 0);
	}
}
254
Ashok Raje6982c62005-06-25 14:54:58 -0700255void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700256{
257 static int printed;
258 if (cpu_has(c, X86_FEATURE_MWAIT)) {
259 /*
260 * Skip, if setup has overridden idle.
261 * One CPU supports mwait => All CPUs supports mwait
262 */
263 if (!pm_idle) {
264 if (!printed) {
265 printk("using mwait in idle threads.\n");
266 printed = 1;
267 }
268 pm_idle = mwait_idle;
269 }
270 }
271}
272
273static int __init idle_setup (char *str)
274{
275 if (!strncmp(str, "poll", 4)) {
276 printk("using polling idle threads.\n");
277 pm_idle = poll_idle;
278 }
279
280 boot_option_idle_override = 1;
281 return 1;
282}
283
284__setup("idle=", idle_setup);
285
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex,gsindex;
	unsigned int ds,cs,es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Read the live segment selectors straight from the CPU. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* On x86-64 the fs/gs base addresses live in MSRs. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	/* Control registers are not part of pt_regs either. */
	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs,fsindex,gs,gsindex,shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
335
/* Print full register state plus a stack trace for the given frame. */
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	/* The stack to walk begins right after the saved register frame. */
	show_trace(NULL, regs, (void *)(regs + 1));
}
342
343/*
344 * Free current thread data structures etc..
345 */
346void exit_thread(void)
347{
348 struct task_struct *me = current;
349 struct thread_struct *t = &me->thread;
Rusty Lynch73649da2005-06-23 00:09:23 -0700350
Linus Torvalds1da177e2005-04-16 15:20:36 -0700351 if (me->thread.io_bitmap_ptr) {
352 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
353
354 kfree(t->io_bitmap_ptr);
355 t->io_bitmap_ptr = NULL;
Stephane Eraniand3a4f482006-09-26 10:52:28 +0200356 clear_thread_flag(TIF_IO_BITMAP);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700357 /*
358 * Careful, clear this in the TSS too:
359 */
360 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
361 t->io_bitmap_max = 0;
362 put_cpu();
363 }
364}
365
/*
 * Reset thread state for exec: complete a pending 32/64bit ABI
 * switch, then clear debug registers, TLS entries and FPU state.
 */
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

	/* XOR flips ABI_PENDING off and toggles IA32 in one step. */
	if (t->flags & _TIF_ABI_PENDING) {
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
		if (t->flags & _TIF_IA32)
			current_thread_info()->status |= TS_COMPAT;
	}
	t->flags &= ~_TIF_DEBUG;

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
391
392void release_thread(struct task_struct *dead_task)
393{
394 if (dead_task->mm) {
395 if (dead_task->mm->context.size) {
396 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
397 dead_task->comm,
398 dead_task->mm->context.ldt,
399 dead_task->mm->context.size);
400 BUG();
401 }
402 }
403}
404
405static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
406{
407 struct user_desc ud = {
408 .base_addr = addr,
409 .limit = 0xfffff,
410 .seg_32bit = 1,
411 .limit_in_pages = 1,
412 .useable = 1,
413 };
414 struct n_desc_struct *desc = (void *)t->thread.tls_array;
415 desc += tls;
416 desc->a = LDT_entry_a(&ud);
417 desc->b = LDT_entry_b(&ud);
418}
419
420static inline u32 read_32bit_tls(struct task_struct *t, int tls)
421{
422 struct desc_struct *desc = (void *)t->thread.tls_array;
423 desc += tls;
424 return desc->base0 |
425 (((u32)desc->base1) << 16) |
426 (((u32)desc->base2) << 24);
427}
428
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	/* Flush lazy FPU state to memory so the child copies a clean image. */
	unlazy_fpu(tsk);
}
437
/*
 * Set up thread state for a newly forked task: build its kernel stack
 * register frame, copy segment state from the parent, duplicate the
 * I/O permission bitmap if present, and install a new TLS when
 * CLONE_SETTLS was requested.  Returns 0 or a negative errno.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	/* Place the child's register frame at the top of its kernel stack. */
	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	/* The child observes a 0 return value from fork/clone. */
	childregs->rax = 0;
	childregs->rsp = rsp;
	/* rsp == ~0UL marks a kernel thread: run on the kernel stack. */
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	/* Snapshot the parent's live segment selectors for the child. */
	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	/* The I/O permission bitmap is per-thread; give the child a copy. */
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure, release the bitmap copy allocated above. */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
501
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)

/*
 * Slow-path pieces of the context switch: reload the hardware debug
 * registers and/or the TSS I/O permission bitmap when the incoming
 * or outgoing task uses them.  Only called when needed (see the
 * _TIF_WORK_CTXSW test in __switch_to()).
 */
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	/* NOTE: comma operator - both assignments run, as written. */
	prev = &prev_p->thread,
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
			max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}
540
/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 *
 * NOTE: the statement order below is load-bearing (segment saves
 * before reloads, unlazy_fpu after the DS reload, PDA switch last
 * before the extra work) - do not reorder casually.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter>5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
	(unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter>5)
		math_state_restore();
	return prev_p;
}
654
/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	/* Copy the pathname from user space into a kernel buffer. */
	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		/* A successful exec clears the delayed-ptrace flag. */
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}
678
/* Switch the current task to the native 64bit personality at exec time. */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit childs are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
692
/* fork(): full copy, child continues on the parent's user stack pointer. */
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
697
Andi Kleena88cde12005-11-05 17:25:54 +0100698asmlinkage long
699sys_clone(unsigned long clone_flags, unsigned long newsp,
700 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700701{
702 if (!newsp)
703 newsp = regs->rsp;
704 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
705}
706
/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}
722
/*
 * Walk the sleeping task's saved frame-pointer chain to find the first
 * return address outside the scheduler - i.e. where the task blocked.
 * Returns 0 for running tasks or when the stack looks inconsistent.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp,rip;
	int count = 0;

	if (!p || p == current || p->state==TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	/* The saved rsp must lie within the task's kernel stack. */
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		/* Each frame pointer must stay on the same stack. */
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);	/* return address of this frame */
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;	/* follow saved frame pointer */
	} while (count++ < 16);		/* bound walk vs corrupt stacks */
	return 0;
}
746
/*
 * Implementation of arch_prctl(2) for 64bit tasks: get or set the
 * FS/GS segment base of @task.  Small bases (<4GB) go through a GDT
 * TLS slot because switching that is cheaper than the MSR; large
 * bases are written to the FS/GS base MSRs directly.  When @task is
 * the current task (@doit), the hardware state is updated immediately.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			/* A live selector overrides the saved 64bit base. */
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
842
/* arch_prctl(2) syscall entry point: operate on the current task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
847
848/*
849 * Capture the user space registers if the task is not running (in user space)
850 */
851int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
852{
853 struct pt_regs *pp, ptregs;
854
Al Virobb049232006-01-12 01:05:38 -0800855 pp = task_pt_regs(tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700856
857 ptregs = *pp;
858 ptregs.cs &= 0xffff;
859 ptregs.ss &= 0xffff;
860
861 elf_core_copy_regs(regs, &ptregs);
862
863 return 1;
864}
865
866unsigned long arch_align_stack(unsigned long sp)
867{
Andi Kleenc16b63e02006-09-26 10:52:28 +0200868 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700869 sp -= get_random_int() % 8192;
870 return sp & ~0xf;
871}