/*
 *  linux/arch/x86-64/kernel/process.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */

#include <stdarg.h>

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/module.h>
#include <linux/a.out.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/ptrace.h>
#include <linux/utsname.h>
#include <linux/random.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>

#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/pda.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>

asmlinkage extern void ret_from_fork(void);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

unsigned long boot_option_idle_override = 0;
EXPORT_SYMBOL(boot_option_idle_override);

/*
 * Power management idle function, if any..
 */
void (*pm_idle)(void);
EXPORT_SYMBOL(pm_idle);
static DEFINE_PER_CPU(unsigned int, cpu_idle_state);

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL(idle_notifier_unregister);

void enter_idle(void)
{
	write_pda(isidle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (test_and_clear_bit_pda(0, isidle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}
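
/*
 * A minimal consumer sketch (hypothetical names, not part of this file):
 * hang a notifier_block off the chain above; its callback receives
 * IDLE_START or IDLE_END as the action argument.
 *
 *	static int my_idle_cb(struct notifier_block *nb,
 *			      unsigned long action, void *data)
 *	{
 *		return NOTIFY_OK;
 *	}
 *	static struct notifier_block my_idle_nb = {
 *		.notifier_call = my_idle_cb,
 *	};
 *	idle_notifier_register(&my_idle_nb);
 */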

/*
 * We use this if we don't have any better
 * idle routine..
 */
static void default_idle(void)
{
	current_thread_info()->status &= ~TS_POLLING;
	/*
	 * TS_POLLING-cleared state must be visible before we
	 * test NEED_RESCHED:
	 */
	smp_mb();
	local_irq_disable();
	if (!need_resched()) {
		/* Enables interrupts one instruction before HLT.
		   x86 special cases this so there is no race. */
		safe_halt();
	} else
		local_irq_enable();
	current_thread_info()->status |= TS_POLLING;
}

/*
 * On SMP it's slightly faster (but much more power-consuming!)
 * to poll the ->need_resched flag instead of waiting for the
 * cross-CPU IPI to arrive. Use this option with caution.
 */
static void poll_idle(void)
{
	local_irq_enable();
	cpu_relax();
}
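
/*
 * Note that the actual polling loop is the need_resched() loop in
 * cpu_idle() below: TS_POLLING stays set here, so the scheduler can skip
 * the wakeup IPI and merely set TIF_NEED_RESCHED to stop the polling.
 */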

void cpu_idle_wait(void)
{
	unsigned int cpu, this_cpu = get_cpu();
	cpumask_t map, tmp = current->cpus_allowed;

	set_cpus_allowed(current, cpumask_of_cpu(this_cpu));
	put_cpu();

	cpus_clear(map);
	for_each_online_cpu(cpu) {
		per_cpu(cpu_idle_state, cpu) = 1;
		cpu_set(cpu, map);
	}

	__get_cpu_var(cpu_idle_state) = 0;

	wmb();
	do {
		ssleep(1);
		for_each_online_cpu(cpu) {
			if (cpu_isset(cpu, map) &&
					!per_cpu(cpu_idle_state, cpu))
				cpu_clear(cpu, map);
		}
		cpus_and(map, map, cpu_online_map);
	} while (!cpus_empty(map));

	set_cpus_allowed(current, tmp);
}
EXPORT_SYMBOL_GPL(cpu_idle_wait);
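
/*
 * How the handshake above works: every online CPU's cpu_idle_state flag
 * is raised, and the idle loop in cpu_idle() lowers its own flag on its
 * next pass; once all flags are down again, each CPU is known to have
 * gone around the loop and re-read pm_idle.
 */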

#ifdef CONFIG_HOTPLUG_CPU
DECLARE_PER_CPU(int, cpu_state);

#include <asm/nmi.h>
/* We halt the CPU with physical CPU hotplug */
static inline void play_dead(void)
{
	idle_task_exit();
	wbinvd();
	mb();
	/* Ack it */
	__get_cpu_var(cpu_state) = CPU_DEAD;

	local_irq_disable();
	while (1)
		halt();
}
#else
static inline void play_dead(void)
{
	BUG();
}
#endif /* CONFIG_HOTPLUG_CPU */

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (ie sit in a loop waiting for
 * somebody to say that they'd like to reschedule)
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;
	/* endless idle loop with no priority at all */
	while (1) {
		while (!need_resched()) {
			void (*idle)(void);

			if (__get_cpu_var(cpu_idle_state))
				__get_cpu_var(cpu_idle_state) = 0;

			tick_nohz_stop_sched_tick();

			rmb();
			idle = pm_idle;
			if (!idle)
				idle = default_idle;
			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			idle();
			/* In many cases the interrupt that ended idle
			   has already called exit_idle. But some idle
			   loops can be woken up without interrupt. */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/*
 * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
 * which can obviate IPI to trigger checking of need_resched.
 * We execute MONITOR against need_resched and enter optimized wait state
 * through MWAIT. Whenever someone changes need_resched, we would be woken
 * up from MWAIT (without an IPI).
 *
 * New with Core Duo processors, MWAIT can take some hints based on CPU
 * capability.
 */
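/*
 * Note the double need_resched() test below: a wakeup that sets
 * TIF_NEED_RESCHED after the first check but before MONITOR is armed
 * would otherwise be lost, since only stores that happen after MONITOR
 * is armed are guaranteed to break MWAIT. Re-checking after __monitor()
 * closes that window.
 */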
void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__mwait(eax, ecx);
	}
}

/* Default MONITOR/MWAIT with no hints, used for default C1 state */
static void mwait_idle(void)
{
	if (!need_resched()) {
		__monitor((void *)&current_thread_info()->flags, 0, 0);
		smp_mb();
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
	} else {
		local_irq_enable();
	}
}

void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
{
	static int printed;
	if (cpu_has(c, X86_FEATURE_MWAIT)) {
		/*
		 * Skip, if setup has overridden idle.
		 * One CPU supports mwait => all CPUs support mwait
		 */
		if (!pm_idle) {
			if (!printed) {
				printk(KERN_INFO "using mwait in idle threads.\n");
				printed = 1;
			}
			pm_idle = mwait_idle;
		}
	}
}

static int __init idle_setup(char *str)
{
	if (!strcmp(str, "poll")) {
		printk("using polling idle threads.\n");
		pm_idle = poll_idle;
	} else if (!strcmp(str, "mwait"))
		force_mwait = 1;
	else
		return -1;

	boot_option_idle_override = 1;
	return 0;
}
early_param("idle", idle_setup);

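/*
 * The hooks above are wired to the "idle=" kernel command line option:
 * booting with "idle=poll" installs poll_idle, while "idle=mwait" sets
 * force_mwait so that MWAIT-based C1 idle may be used even on CPUs where
 * the kernel would otherwise avoid it.
 */
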
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs * regs)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;

	printk("\n");
	print_modules();
	printk("Pid: %d, comm: %.20s %s %s %.*s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
	printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->rip);
	printk_address(regs->rip);
	printk("RSP: %04lx:%016lx  EFLAGS: %08lx\n", regs->ss, regs->rsp,
		regs->eflags);
	printk("RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->rax, regs->rbx, regs->rcx);
	printk("RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->rdx, regs->rsi, regs->rdi);
	printk("RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->rbp, regs->r8, regs->r9);
	printk("R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk("FS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk("CS:  %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0);
	printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk("CPU %d:", smp_processor_id());
	__show_regs(regs);
	show_trace(NULL, regs, (void *)(regs + 1));
}

/*
 * Free current thread data structures etc..
 */
void exit_thread(void)
{
	struct task_struct *me = current;
	struct thread_struct *t = &me->thread;

	if (me->thread.io_bitmap_ptr) {
		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());

		kfree(t->io_bitmap_ptr);
		t->io_bitmap_ptr = NULL;
		clear_thread_flag(TIF_IO_BITMAP);
		/*
		 * Careful, clear this in the TSS too:
		 */
		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
		t->io_bitmap_max = 0;
		put_cpu();
	}
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
		clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
		if (test_tsk_thread_flag(tsk, TIF_IA32)) {
			clear_tsk_thread_flag(tsk, TIF_IA32);
		} else {
			set_tsk_thread_flag(tsk, TIF_IA32);
			current_thread_info()->status |= TS_COMPAT;
		}
	}
	clear_tsk_thread_flag(tsk, TIF_DEBUG);

	tsk->thread.debugreg0 = 0;
	tsk->thread.debugreg1 = 0;
	tsk->thread.debugreg2 = 0;
	tsk->thread.debugreg3 = 0;
	tsk->thread.debugreg6 = 0;
	tsk->thread.debugreg7 = 0;
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
	/*
	 * Forget coprocessor state..
	 */
	clear_fpu(tsk);
	clear_used_math();
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct n_desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	desc->a = LDT_entry_a(&ud);
	desc->b = LDT_entry_b(&ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	struct desc_struct *desc = (void *)t->thread.tls_array;
	desc += tls;
	return desc->base0 |
		(((u32)desc->base1) << 16) |
		(((u32)desc->base2) << 24);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it. Flushing the lazy FPU state here makes
 * sure the copy sees up-to-date i387 contents.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->rax = 0;
	childregs->rsp = rsp;
	if (rsp == ~0UL)
		childregs->rsp = (unsigned long)childregs;

	p->thread.rsp = (unsigned long) childregs;
	p->thread.rsp0 = (unsigned long) (childregs+1);
	p->thread.userrsp = me->thread.userrsp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;

	asm("mov %%gs,%0" : "=m" (p->thread.gsindex));
	asm("mov %%fs,%0" : "=m" (p->thread.fsindex));
	asm("mov %%es,%0" : "=m" (p->thread.es));
	asm("mov %%ds,%0" : "=m" (p->thread.ds));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = ia32_child_tls(p, childregs);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}
	return err;
}

/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)

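/*
 * Token pasting makes e.g. loaddebug(next, 7) expand to
 * set_debugreg(next->debugreg7, 7); see __switch_to_xtra() below.
 */
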
static inline void __switch_to_xtra(struct task_struct *prev_p,
				    struct task_struct *next_p,
				    struct tss_struct *tss)
{
	struct thread_struct *prev, *next;

	prev = &prev_p->thread;
	next = &next_p->thread;

	if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
		loaddebug(next, 0);
		loaddebug(next, 1);
		loaddebug(next, 2);
		loaddebug(next, 3);
		/* no 4 and 5 */
		loaddebug(next, 6);
		loaddebug(next, 7);
	}

	if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
		/*
		 * Copy the relevant range of the IO bitmap.
		 * Normally this is 128 bytes or less:
		 */
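		/*
		 * Covering max(prev, next) bytes also guarantees that any
		 * ports the outgoing task had enabled beyond the incoming
		 * task's bitmap are overwritten rather than left enabled.
		 */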
		memcpy(tss->io_bitmap, next->io_bitmap_ptr,
		       max(prev->io_bitmap_max, next->io_bitmap_max));
	} else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
		/*
		 * Clear any possible leftover bits:
		 */
		memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
	}
}

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes not supported here. Set the probe on schedule instead.
 */
__kprobes struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread,
				 *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);

	/* we're going to use this soon, after a few expensive things */
	if (next_p->fpu_counter > 5)
		prefetch(&next->i387.fxsave);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	tss->rsp0 = next->rsp0;

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	asm volatile("mov %%es,%0" : "=m" (prev->es));
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	asm volatile("mov %%ds,%0" : "=m" (prev->ds));
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	load_TLS(next, cpu);

	/*
	 * Switch FS and GS.
	 */
	{
		unsigned fsindex;
		asm volatile("movl %%fs,%0" : "=r" (fsindex));
		/* segment register != 0 always requires a reload.
		   also reload when it has changed.
		   when prev process used 64bit base always reload
		   to avoid an information leak. */
		if (unlikely(fsindex | next->fsindex | prev->fs)) {
			loadsegment(fs, next->fsindex);
			/* check if the user used a selector != 0
			 * if yes clear 64bit base, since overloaded base
			 * is always mapped to the Null selector
			 */
			if (fsindex)
				prev->fs = 0;
		}
		/* when next process has a 64bit base use it */
		if (next->fs)
			wrmsrl(MSR_FS_BASE, next->fs);
		prev->fsindex = fsindex;
	}
	{
		unsigned gsindex;
		asm volatile("movl %%gs,%0" : "=r" (gsindex));
		if (unlikely(gsindex | next->gsindex | prev->gs)) {
			load_gs_index(next->gsindex);
			if (gsindex)
				prev->gs = 0;
		}
		if (next->gs)
			wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
		prev->gsindex = gsindex;
	}

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->userrsp = read_pda(oldrsp);
	write_pda(oldrsp, next->userrsp);
	write_pda(pcurrent, next_p);

	write_pda(kernelstack,
		  (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET);
#ifdef CONFIG_CC_STACKPROTECTOR
	write_pda(stack_canary, next_p->stack_canary);
	/*
	 * Build time only check to make sure the stack_canary is at
	 * offset 40 in the pda; this is a gcc ABI requirement
	 */
	BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
#endif

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely((task_thread_info(next_p)->flags & _TIF_WORK_CTXSW))
	    || test_tsk_thread_flag(prev_p, TIF_IO_BITMAP))
		__switch_to_xtra(prev_p, next_p, tss);

	/* If the task has used fpu the last 5 timeslices, just do a full
	 * restore of the math state immediately to avoid the trap; the
	 * chances of needing FPU soon are obviously high now
	 */
	if (next_p->fpu_counter > 5)
		math_state_restore();
	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, &regs);
	if (error == 0) {
		task_lock(current);
		current->ptrace &= ~PT_DTRACE;
		task_unlock(current);
	}
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/* TBD: overwrites user setup. Should have two bits.
	   But 64bit processes have always behaved this way,
	   so it's not too bad. The main problem is just that
	   32bit children are affected again. */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long sys_fork(struct pt_regs *regs)
{
	return do_fork(SIGCHLD, regs->rsp, regs, 0, NULL, NULL);
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->rsp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

/*
 * This is trivial, and on the face of it looks like it
 * could equally well be done in user mode.
 *
 * Not so, for quite unobvious reasons - register pressure.
 * In user mode vfork() cannot have a stack frame, and if
 * done by calling the "clone()" system call directly, you
 * do not have enough call-clobbered registers to hold all
 * the information you need.
 */
asmlinkage long sys_vfork(struct pt_regs *regs)
{
	return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->rsp, regs, 0,
		    NULL, NULL);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, rip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.rsp < stack || p->thread.rsp > stack+THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.rsp);
	do {
		if (fp < (unsigned long)stack ||
		    fp > (unsigned long)stack+THREAD_SIZE)
			return 0;
		rip = *(u64 *)(fp+8);
		if (!in_sched_functions(rip))
			return rip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				asm volatile("movl %0,%%fs" :: "r"(FS_TLS_SEL));
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				asm volatile("movl %0,%%fs" :: "r" (0));
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			asm("movl %%gs,%0" : "=r" (gsindex));
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

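/*
 * Illustrative userspace sketch (not part of this file; "tls_block" is a
 * made-up name): a 64-bit process typically reaches this through the raw
 * arch_prctl(2) syscall, e.g.
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	static unsigned long tls_block[64];
 *	unsigned long base;
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_FS, (unsigned long)tls_block);
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, (unsigned long)&base);
 *	(base now equals (unsigned long)tls_block)
 */
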
/*
 * Capture the user space registers if the task is not running (in user space)
 */
int dump_task_regs(struct task_struct *tsk, elf_gregset_t *regs)
{
	struct pt_regs *pp, ptregs;

	pp = task_pt_regs(tsk);

	ptregs = *pp;
	ptregs.cs &= 0xffff;
	ptregs.ss &= 0xffff;

	elf_core_copy_regs(regs, &ptregs);

	return 1;
}

unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}
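
/*
 * Worked example of the math above: with randomization active, an
 * incoming sp of 0x7fffffffe000 becomes some 16-byte-aligned value in
 * [0x7fffffffc000, 0x7fffffffe000], i.e. at most 8191 bytes lower,
 * rounded down by the ~0xf mask.
 */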