/*
 * linux/arch/x86-64/kernel/process.c
 *
 * Copyright (C) 1995 Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 * CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/kdebug.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);

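/*
 * The idle notifier chain lets interested code hear about each transition
 * into and out of the idle loop.  The per-CPU "isidle" PDA flag below makes
 * sure IDLE_END is signalled only once per idle period, even if exit_idle()
 * is reached several times.
 */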
void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (read_pda(isidle) == 0)
		return;
	write_pda(isidle, 0);
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

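/*
 * TS_POLLING tells the scheduler that this CPU watches need_resched from its
 * idle loop and therefore needs no reschedule IPI.  default_idle() clears it
 * around the halt, because a halted CPU does need the interrupt.
 */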
/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	local_irq_enable();

	current_thread_info()->status &= ~TS_POLLING;
	smp_mb__after_clear_bit();
	while (!need_resched()) {
		local_irq_disable();
		if (!need_resched())
			safe_halt();
		else
			local_irq_enable();
	}
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle (void)
{
	local_irq_enable();

	asm volatile(
		"2:"
		"testl %0,%1;"
		"rep; nop;"
		"je 2b;"
		: :
		"i" (_TIF_NEED_RESCHED),
		"m" (current_thread_info()->flags));
}

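/*
 * cpu_idle_wait() is typically called after pm_idle has been changed: it
 * marks every online CPU and then waits until each one has gone once more
 * around cpu_idle() (which clears cpu_idle_state), so no CPU can still be
 * executing the old idle handler.
 */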
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);

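/*
 * play_dead() is reached from cpu_idle() once this CPU has been marked
 * offline: flush caches, report CPU_DEAD to the waiting hotplug code, and
 * halt forever with interrupts off.
 */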
#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle (void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 */
static void mwait_idle(void)
{
	local_irq_enable();

	while (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (need_resched())
			break;
		__mwait(0, 0);
	}
}

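/*
 * Called during CPU setup.  MWAIT support is uniform across CPUs, so the
 * first CPU that sees the feature switches pm_idle over to mwait_idle,
 * unless pm_idle was already set (e.g. by "idle=poll").
 */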
void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip if setup has overridden idle.
		 * If one CPU supports mwait, all CPUs support mwait.
		 */
		if (!pm_idle) {
			if (!printed) {
				printk("using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}

static int __init idle_setup (char *str)
{
	if (!strncmp(str, "poll", 4)) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	}

	boot_option_idle_override = 1;
	return 1;
}

__setup("idle=", idle_setup);

/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		system_utsname.release,
		(int)strcspn(system_utsname.version, " "),
		system_utsname.version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

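/*
 * flush_thread() runs on exec: complete a pending 32/64-bit ABI switch, then
 * clear the debug registers, TLS entries and FPU state so the new image
 * starts from a clean slate.
 */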
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

	if (t->flags & _TIF_ABI_PENDING) {
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
		if (t->flags & _TIF_IA32)
			current_thread_info()->status |= TS_COMPAT;
	}
	t->flags &= ~_TIF_DEBUG;

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt,
				dead_task->mm->context.size);
			BUG();
		}
	}
}

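/*
 * Helpers for FS/GS bases that fit in 32 bits: such bases are stored in a
 * GDT TLS descriptor (see do_arch_prctl() below), which is cheaper to switch
 * than writing the base MSRs.
 */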
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

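/*
 * Set up the kernel stack and thread state of a new child: copy the parent's
 * registers, segment selectors and FS/GS bases, and duplicate the I/O
 * permission bitmap if the parent has one.
 */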
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
			IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)

static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread,
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
			     *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	/* This must be here to ensure both math_state_restore() and
	   kernel_fpu_begin() work consistently.
	   And the AMD workaround requires it to be after DS reload. */
	unlazy_fpu(prev_p);
	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char * filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32-bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		       NULL, NULL);
}

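/*
 * get_wchan(): walk the saved frame pointers of a sleeping task to find the
 * first return address outside the scheduler; give up after 16 frames or as
 * soon as a frame leaves the task's stack.
 */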
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

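/*
 * arch_prctl() backend: ARCH_SET_FS/ARCH_SET_GS install a new segment base,
 * ARCH_GET_FS/ARCH_GET_GS report the current one.  Bases below 4GB go
 * through a GDT TLS entry, larger bases through the FS/GS base MSRs.
 */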
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}

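/*
 * Randomize the start of the user stack by up to 8k (keeping 16-byte
 * alignment) unless randomization is disabled for this process.
 */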
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}