blob: b596837a1527cda12b28c15a8e896006197090e4 [file] [log] [blame]
/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
18
19#include <stdarg.h>
20
Ashok Raj76e4f662005-06-25 14:55:00 -070021#include <linux/cpu.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include <linux/errno.h>
23#include <linux/sched.h>
24#include <linux/kernel.h>
25#include <linux/mm.h>
26#include <linux/elfcore.h>
27#include <linux/smp.h>
28#include <linux/slab.h>
29#include <linux/user.h>
30#include <linux/module.h>
31#include <linux/a.out.h>
32#include <linux/interrupt.h>
33#include <linux/delay.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include <linux/ptrace.h>
35#include <linux/utsname.h>
36#include <linux/random.h>
Andi Kleen95833c82006-01-11 22:44:36 +010037#include <linux/notifier.h>
bibo maoc6fd91f2006-03-26 01:38:20 -080038#include <linux/kprobes.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070039
40#include <asm/uaccess.h>
41#include <asm/pgtable.h>
42#include <asm/system.h>
43#include <asm/io.h>
44#include <asm/processor.h>
45#include <asm/i387.h>
46#include <asm/mmu_context.h>
47#include <asm/pda.h>
48#include <asm/prctl.h>
49#include <asm/kdebug.h>
50#include <asm/desc.h>
51#include <asm/proto.h>
52#include <asm/ia32.h>
Andi Kleen95833c82006-01-11 22:44:36 +010053#include <asm/idle.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070054
/* Entry stub (in entry.S) that new kernel/user threads return through. */
asmlinkage extern void ret_from_fork(void);

/* Default clone flags used by kernel_thread(). */
unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

/* Non-zero once "idle=" on the command line has overridden the idle routine. */
unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Powermanagement idle function, if any..
 */
void (*pm_idle)(void);
/* Per-CPU flag set by cpu_idle_wait() and acknowledged by the idle loop. */
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);
67
Alan Sterne041c682006-03-27 01:16:30 -080068static ATOMIC_NOTIFIER_HEAD(idle_notifier);
Andi Kleen95833c82006-01-11 22:44:36 +010069
70void idle_notifier_register(struct notifier_block *n)
71{
Alan Sterne041c682006-03-27 01:16:30 -080072 atomic_notifier_chain_register(&idle_notifier, n);
Andi Kleen95833c82006-01-11 22:44:36 +010073}
74EXPORT_SYMBOL_GPL(idle_notifier_register);
75
76void idle_notifier_unregister(struct notifier_block *n)
77{
Alan Sterne041c682006-03-27 01:16:30 -080078 atomic_notifier_chain_unregister(&idle_notifier, n);
Andi Kleen95833c82006-01-11 22:44:36 +010079}
80EXPORT_SYMBOL(idle_notifier_unregister);
81
/* Per-CPU bookkeeping of whether this CPU currently sits in the idle loop. */
enum idle_state { CPU_IDLE, CPU_NOT_IDLE };
static DEFINE_PER_CPU(enum idle_state, idle_state) = CPU_NOT_IDLE;

/* Mark this CPU idle and tell interested parties (e.g. for power saving). */
void enter_idle(void)
{
	__get_cpu_var(idle_state) = CPU_IDLE;
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

/* Unconditionally mark this CPU busy again and fire IDLE_END. */
static void __exit_idle(void)
{
	__get_cpu_var(idle_state) = CPU_NOT_IDLE;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}
96
/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/*
	 * Bitwise '|' is deliberate: a single test covers both "we are the
	 * idle task (pid 0)" and "not in a nested interrupt" conditions.
	 * Only the outermost interrupt on the idle task ends the idle state.
	 */
	if (current->pid | read_pda(irqcount))
		return;
	__exit_idle();
}
104
Linus Torvalds1da177e2005-04-16 15:20:36 -0700105/*
106 * We use this if we don't have any better
107 * idle routine..
108 */
Adrian Bunkcdb04522006-03-24 03:15:57 -0800109static void default_idle(void)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700110{
Nick Piggin64c7c8f2005-11-08 21:39:04 -0800111 local_irq_enable();
112
Andi Kleen495ab9c2006-06-26 13:59:11 +0200113 current_thread_info()->status &= ~TS_POLLING;
Andi Kleen2d52ede2006-01-11 22:42:42 +0100114 smp_mb__after_clear_bit();
115 while (!need_resched()) {
116 local_irq_disable();
117 if (!need_resched())
118 safe_halt();
119 else
120 local_irq_enable();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700121 }
Andi Kleen495ab9c2006-06-26 13:59:11 +0200122 current_thread_info()->status |= TS_POLLING;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123}
124
125/*
126 * On SMP it's slightly faster (but much more power-consuming!)
127 * to poll the ->need_resched flag instead of waiting for the
128 * cross-CPU IPI to arrive. Use this option with caution.
129 */
130static void poll_idle (void)
131{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700132 local_irq_enable();
133
Nick Piggin64c7c8f2005-11-08 21:39:04 -0800134 asm volatile(
135 "2:"
136 "testl %0,%1;"
137 "rep; nop;"
138 "je 2b;"
139 : :
140 "i" (_TIF_NEED_RESCHED),
141 "m" (current_thread_info()->flags));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700142}
143
/*
 * Wait until every online CPU has passed through its idle loop at least
 * once, so a change of pm_idle is guaranteed to be observed everywhere.
 */
void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map;

	/* Pin ourselves so "this_cpu" stays valid while we wait. */
	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	/* Flag every online CPU; the idle loop clears its own flag. */
	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	/* We are obviously not idle ourselves. */
	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	/* Poll once a second until all flagged CPUs have acknowledged. */
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		/* Drop CPUs that went offline meanwhile. */
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
172
Ashok Raj76e4f662005-06-25 14:55:00 -0700173#ifdef CONFIG_HOTPLUG_CPU
174DECLARE_PER_CPU(int, cpu_state);
175
176#include <asm/nmi.h>
Shaohua Li1fa744e2006-01-06 00:12:20 -0800177/* We halt the CPU with physical CPU hotplug */
Ashok Raj76e4f662005-06-25 14:55:00 -0700178static inline void play_dead(void)
179{
180 idle_task_exit();
181 wbinvd();
182 mb();
183 /* Ack it */
184 __get_cpu_var(cpu_state) = CPU_DEAD;
185
Shaohua Li1fa744e2006-01-06 00:12:20 -0800186 local_irq_disable();
Ashok Raj76e4f662005-06-25 14:55:00 -0700187 while (1)
Shaohua Li1fa744e2006-01-06 00:12:20 -0800188 halt();
Ashok Raj76e4f662005-06-25 14:55:00 -0700189}
190#else
191static inline void play_dead(void)
192{
193 BUG();
194}
195#endif /* CONFIG_HOTPLUG_CPU */
196
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197/*
198 * The idle thread. There's no useful work to be
199 * done, so just try to conserve power and have a
200 * low exit latency (ie sit in a loop waiting for
201 * somebody to say that they'd like to reschedule)
202 */
203void cpu_idle (void)
204{
Andi Kleen495ab9c2006-06-26 13:59:11 +0200205 current_thread_info()->status |= TS_POLLING;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206 /* endless idle loop with no priority at all */
207 while (1) {
208 while (!need_resched()) {
209 void (*idle)(void);
210
211 if (__get_cpu_var(cpu_idle_state))
212 __get_cpu_var(cpu_idle_state) = 0;
213
214 rmb();
215 idle = pm_idle;
216 if (!idle)
217 idle = default_idle;
Ashok Raj76e4f662005-06-25 14:55:00 -0700218 if (cpu_is_offline(smp_processor_id()))
219 play_dead();
Andi Kleen95833c82006-01-11 22:44:36 +0100220 enter_idle();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700221 idle();
Andi Kleen95833c82006-01-11 22:44:36 +0100222 __exit_idle();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700223 }
224
Nick Piggin5bfb5d62005-11-08 21:39:01 -0800225 preempt_enable_no_resched();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700226 schedule();
Nick Piggin5bfb5d62005-11-08 21:39:01 -0800227 preempt_disable();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700228 }
229}
230
231/*
232 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
233 * which can obviate IPI to trigger checking of need_resched.
234 * We execute MONITOR against need_resched and enter optimized wait state
235 * through MWAIT. Whenever someone changes need_resched, we would be woken
236 * up from MWAIT (without an IPI).
237 */
238static void mwait_idle(void)
239{
240 local_irq_enable();
241
Nick Piggin64c7c8f2005-11-08 21:39:04 -0800242 while (!need_resched()) {
243 __monitor((void *)&current_thread_info()->flags, 0, 0);
244 smp_mb();
245 if (need_resched())
246 break;
247 __mwait(0, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700248 }
249}
250
Ashok Raje6982c62005-06-25 14:54:58 -0700251void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700252{
253 static int printed;
254 if (cpu_has(c, X86_FEATURE_MWAIT)) {
255 /*
256 * Skip, if setup has overridden idle.
257 * One CPU supports mwait => All CPUs supports mwait
258 */
259 if (!pm_idle) {
260 if (!printed) {
261 printk("using mwait in idle threads.\n");
262 printed = 1;
263 }
264 pm_idle = mwait_idle;
265 }
266 }
267}
268
269static int __init idle_setup (char *str)
270{
271 if (!strncmp(str, "poll", 4)) {
272 printk("using polling idle threads.\n");
273 pm_idle = poll_idle;
274 }
275
276 boot_option_idle_override = 1;
277 return 1;
278}
279
280__setup("idle=", idle_setup);
281
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned int fsindex,gsindex;
	unsigned int ds,cs,es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		system_utsname.release,
		(int)strcspn(system_utsname.version, " "),
		system_utsname.version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("\nRSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	/* Segment selectors live in the CPU, not in pt_regs: read them. */
	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	/* The 64-bit segment bases are in MSRs, not in the selectors. */
	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	asm("movq %%cr0, %0": "=r" (cr0));
	asm("movq %%cr2, %0": "=r" (cr2));
	asm("movq %%cr3, %0": "=r" (cr3));
	asm("movq %%cr4, %0": "=r" (cr4));

	printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs,fsindex,gs,gsindex,shadowgs);
	printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
331
/* Dump registers plus a stack trace starting just above the pt_regs frame. */
void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	/* (regs + 1) is the stack position right after the saved registers. */
	show_trace(NULL, regs, (void *)(regs + 1));
}
338
339/*
340 * Free current thread data structures etc..
341 */
342void exit_thread(void)
343{
344 struct task_struct *me = current;
345 struct thread_struct *t = &me->thread;
Rusty Lynch73649da2005-06-23 00:09:23 -0700346
Linus Torvalds1da177e2005-04-16 15:20:36 -0700347 if (me->thread.io_bitmap_ptr) {
348 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
349
350 kfree(t->io_bitmap_ptr);
351 t->io_bitmap_ptr = NULL;
352 /*
353 * Careful, clear this in the TSS too:
354 */
355 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
356 t->io_bitmap_max = 0;
357 put_cpu();
358 }
359}
360
/*
 * Reset per-thread state for the current task on exec(): complete any
 * pending 32/64-bit ABI switch, clear debug registers, TLS entries and
 * the FPU state.
 */
void flush_thread(void)
{
	struct task_struct *tsk = current;
	struct thread_info *t = current_thread_info();

	if (t->flags & _TIF_ABI_PENDING) {
		/* Toggle ABI: the pending bit goes away, _TIF_IA32 flips. */
		t->flags ^= (_TIF_ABI_PENDING | _TIF_IA32);
		if (t->flags & _TIF_IA32)
			current_thread_info()->status |= TS_COMPAT;
	}

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}
385
386void release_thread(struct task_struct *dead_task)
387{
388 if (dead_task->mm) {
389 if (dead_task->mm->context.size) {
390 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
391 dead_task->comm,
392 dead_task->mm->context.ldt,
393 dead_task->mm->context.size);
394 BUG();
395 }
396 }
397}
398
/*
 * Install a 32-bit flat segment with the given base address into one of
 * the task's TLS GDT slots (used for small FS/GS bases, see do_arch_prctl).
 */
static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	/* Encode the user_desc into the raw two-word GDT descriptor format. */
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}
413
414static inline u32 read_32bit_tls(struct task_struct *t, int tls)
415{
416 struct desc_struct *desc = (void *)t->thread.tls_array;
417 desc += tls;
418 return desc->base0 |
419 (((u32)desc->base1) << 16) |
420 (((u32)desc->base2) << 24);
421}
422
423/*
424 * This gets called before we allocate a new thread and copy
425 * the current task into it.
426 */
427void prepare_to_copy(struct task_struct *tsk)
428{
429 unlazy_fpu(tsk);
430}
431
/*
 * Set up the kernel stack, register frame and per-thread state of a
 * freshly forked child.  Returns 0 or -ENOMEM (io bitmap allocation) /
 * a TLS setup error; on error any allocated io bitmap is freed again.
 */
int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
	struct task_struct * p, struct pt_regs * regs)
{
	int err;
	struct pt_regs * childregs;
	struct task_struct *me = current;

	/* The child's pt_regs live at the very top of its kernel stack. */
	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	/* Child sees fork()/clone() return 0. */
	childregs->rax = 0;
	childregs->rsp = rsp;
	/* ~0UL means "kernel thread": run on its own kernel stack. */
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	/* Make the child return through ret_from_fork. */
	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	/* Snapshot the parent's live segment selectors for the child. */
	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	/* Duplicate the I/O permission bitmap if the parent has one. */
	if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			/* 64-bit convention: new FS base is passed in r8. */
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	/* On failure, release the io bitmap allocated above. */
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}
494
495/*
496 * This special macro can be used to load a debugging register
497 */
Jan Beulich2b514e72006-03-25 16:29:22 +0100498#define loaddebug(thread,r) set_debugreg(thread->debugreg ## r, r)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499
500/*
501 * switch_to(x,y) should switch tasks from x to y.
502 *
503 * This could still be optimized:
504 * - fold all the options into a flag word and test it with a single test.
505 * - could test fs/gs bitsliced
Andi Kleen099f3182006-02-03 21:51:38 +0100506 *
507 * Kprobes not supported here. Set the probe on schedule instead.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700508 */
Andi Kleen099f3182006-02-03 21:51:38 +0100509__kprobes struct task_struct *
Andi Kleena88cde12005-11-05 17:25:54 +0100510__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700511{
512 struct thread_struct *prev = &prev_p->thread,
513 *next = &next_p->thread;
514 int cpu = smp_processor_id();
515 struct tss_struct *tss = &per_cpu(init_tss, cpu);
516
Linus Torvalds1da177e2005-04-16 15:20:36 -0700517 /*
518 * Reload esp0, LDT and the page table pointer:
519 */
520 tss->rsp0 = next->rsp0;
521
522 /*
523 * Switch DS and ES.
524 * This won't pick up thread selector changes, but I guess that is ok.
525 */
H. J. Lufd51f662005-05-01 08:58:48 -0700526 asm volatile("mov %%es,%0" : "=m" (prev->es));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700527 if (unlikely(next->es | prev->es))
528 loadsegment(es, next->es);
529
H. J. Lufd51f662005-05-01 08:58:48 -0700530 asm volatile ("mov %%ds,%0" : "=m" (prev->ds));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700531 if (unlikely(next->ds | prev->ds))
532 loadsegment(ds, next->ds);
533
534 load_TLS(next, cpu);
535
536 /*
537 * Switch FS and GS.
538 */
539 {
540 unsigned fsindex;
541 asm volatile("movl %%fs,%0" : "=r" (fsindex));
542 /* segment register != 0 always requires a reload.
543 also reload when it has changed.
544 when prev process used 64bit base always reload
545 to avoid an information leak. */
546 if (unlikely(fsindex | next->fsindex | prev->fs)) {
547 loadsegment(fs, next->fsindex);
548 /* check if the user used a selector != 0
549 * if yes clear 64bit base, since overloaded base
550 * is always mapped to the Null selector
551 */
552 if (fsindex)
553 prev->fs = 0;
554 }
555 /* when next process has a 64bit base use it */
556 if (next->fs)
557 wrmsrl(MSR_FS_BASE, next->fs);
558 prev->fsindex = fsindex;
559 }
560 {
561 unsigned gsindex;
562 asm volatile("movl %%gs,%0" : "=r" (gsindex));
563 if (unlikely(gsindex | next->gsindex | prev->gs)) {
564 load_gs_index(next->gsindex);
565 if (gsindex)
566 prev->gs = 0;
567 }
568 if (next->gs)
569 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
570 prev->gsindex = gsindex;
571 }
572
573 /*
Jan Beulich45948d72006-03-25 16:29:25 +0100574 * Switch the PDA and FPU contexts.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700575 */
576 prev->userrsp = read_pda(oldrsp);
577 write_pda(oldrsp, next->userrsp);
578 write_pda(pcurrent, next_p);
Andi Kleen18bd0572006-04-20 02:36:45 +0200579
Jan Beulich45948d72006-03-25 16:29:25 +0100580 /* This must be here to ensure both math_state_restore() and
Andi Kleen18bd0572006-04-20 02:36:45 +0200581 kernel_fpu_begin() work consistently.
582 And the AMD workaround requires it to be after DS reload. */
Jan Beulich45948d72006-03-25 16:29:25 +0100583 unlazy_fpu(prev_p);
Andi Kleena88cde12005-11-05 17:25:54 +0100584 write_pda(kernelstack,
Al Viro57eafdc2006-01-12 01:05:39 -0800585 task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700586
587 /*
588 * Now maybe reload the debug registers
589 */
590 if (unlikely(next->debugreg7)) {
591 loaddebug(next, 0);
592 loaddebug(next, 1);
593 loaddebug(next, 2);
594 loaddebug(next, 3);
595 /* no 4 and 5 */
596 loaddebug(next, 6);
597 loaddebug(next, 7);
598 }
599
600
601 /*
602 * Handle the IO bitmap
603 */
604 if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
605 if (next->io_bitmap_ptr)
606 /*
607 * Copy the relevant range of the IO bitmap.
608 * Normally this is 128 bytes or less:
609 */
610 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
611 max(prev->io_bitmap_max, next->io_bitmap_max));
612 else {
613 /*
614 * Clear any possible leftover bits:
615 */
616 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
617 }
618 }
619
620 return prev_p;
621}
622
623/*
624 * sys_execve() executes a new program.
625 */
626asmlinkage
627long sys_execve(char __user *name, char __user * __user *argv,
628 char __user * __user *envp, struct pt_regs regs)
629{
630 long error;
631 char * filename;
632
633 filename = getname(name);
634 error = PTR_ERR(filename);
635 if (IS_ERR(filename))
636 return error;
637 error = do_execve(filename, argv, envp, &regs);
638 if (error == 0) {
639 task_lock(current);
640 current->ptrace &= ~PT_DTRACE;
641 task_unlock(current);
642 }
643 putname(filename);
644 return error;
645}
646
/* Configure the current task as a native 64-bit process (called on exec). */
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit childs are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}
660
/* fork(2): full copy, child keeps the parent's user stack pointer. */
asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}
665
Andi Kleena88cde12005-11-05 17:25:54 +0100666asmlinkage long
667sys_clone(unsigned long clone_flags, unsigned long newsp,
668 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700669{
670 if (!newsp)
671 newsp = regs->rsp;
672 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
673}
674
675/*
676 * This is trivial, and on the face of it looks like it
677 * could equally well be done in user mode.
678 *
679 * Not so, for quite unobvious reasons - register pressure.
680 * In user mode vfork() cannot have a stack frame, and if
681 * done by calling the "clone()" system call directly, you
682 * do not have enough call-clobbered registers to hold all
683 * the information you need.
684 */
685asmlinkage long sys_vfork(struct pt_regs *regs)
686{
687 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
688 NULL, NULL);
689}
690
691unsigned long get_wchan(struct task_struct *p)
692{
693 unsigned long stack;
694 u64 fp,rip;
695 int count = 0;
696
697 if (!p || p == current || p->state==TASK_RUNNING)
698 return 0;
Al Viro57eafdc2006-01-12 01:05:39 -0800699 stack = (unsigned long)task_stack_page(p);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700700 if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
701 return 0;
702 fp = *(u64 *)(p->thread.rsp);
703 do {
Andi Kleena88cde12005-11-05 17:25:54 +0100704 if (fp < (unsigned long)stack ||
705 fp > (unsigned long)stack+THREAD_SIZE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700706 return 0;
707 rip = *(u64 *)(fp+8);
708 if (!in_sched_functions(rip))
709 return rip;
710 fp = *(u64 *)fp;
711 } while (count++ < 16);
712 return 0;
713}
714
/*
 * Implement arch_prctl(): get/set the FS and GS base addresses of a task.
 * Bases <= 4GB are stored in a GDT TLS slot (cheaper to switch); larger
 * bases go through the MSRs.  When task == current the hardware state is
 * updated immediately, otherwise only the saved thread state changes.
 * Returns 0, -EPERM (base beyond the task's address space) or -EINVAL.
 */
long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		/* Read the base from wherever it currently lives: TLS slot,
		   live MSR (current task only), or saved thread state. */
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			/* A non-zero live selector means the MSR is valid. */
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		}
		else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}
810
/* arch_prctl(2) entry point: operate on the calling task. */
long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}
815
816/*
817 * Capture the user space registers if the task is not running (in user space)
818 */
819int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
820{
821 struct pt_regs *pp, ptregs;
822
Al Virobb049232006-01-12 01:05:38 -0800823 pp = task_pt_regs(tsk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700824
825 ptregs = *pp;
826 ptregs.cs &= 0xffff;
827 ptregs.ss &= 0xffff;
828
829 elf_core_copy_regs(regs, &ptregs);
830
831 return 1;
832}
833
834unsigned long arch_align_stack(unsigned long sp)
835{
836 if (randomize_va_space)
837 sp -= get_random_int() % 8192;
838 return sp & ~0xf;
839}