x86: make lazy %gs optional on x86_32
Impact: pt_regs changed, lazy gs handling made optional, add slight
overhead to SAVE_ALL, simplifies error_code path a bit
On x86_32, %gs hasn't been used by kernel and handled lazily. pt_regs
doesn't have place for it and gs is saved/loaded only when necessary.
In preparation for stack protector support, this patch makes lazy %gs
handling optional by doing the followings.
* Add CONFIG_X86_32_LAZY_GS and place for gs in pt_regs.
* Save and restore %gs along with other registers in entry_32.S unless
LAZY_GS. Note that this unfortunately adds "pushl $0" on SAVE_ALL
even when LAZY_GS. However, it adds no overhead to common exit path
and simplifies entry path with error code.
* Define different user_gs accessors depending on LAZY_GS and add
lazy_save_gs() and lazy_load_gs() which are noop if !LAZY_GS. The
lazy_*_gs() ops are used to save, load and clear %gs lazily.
* Define ELF_CORE_COPY_KERNEL_REGS() which always read %gs directly.
xen and lguest changes need to be verified.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5c8e353..5bcdede 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -207,6 +207,10 @@
depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP)
default y
+config X86_32_LAZY_GS
+ def_bool y
+ depends on X86_32
+
config KTIME_SCALAR
def_bool X86_32
source "init/Kconfig"
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 39b0aac..83c1bc8 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -112,7 +112,7 @@
* now struct_user_regs, they are different)
*/
-#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+#define ELF_CORE_COPY_REGS_COMMON(pr_reg, regs) \
do { \
pr_reg[0] = regs->bx; \
pr_reg[1] = regs->cx; \
@@ -124,7 +124,6 @@
pr_reg[7] = regs->ds & 0xffff; \
pr_reg[8] = regs->es & 0xffff; \
pr_reg[9] = regs->fs & 0xffff; \
- pr_reg[10] = get_user_gs(regs); \
pr_reg[11] = regs->orig_ax; \
pr_reg[12] = regs->ip; \
pr_reg[13] = regs->cs & 0xffff; \
@@ -133,6 +132,18 @@
pr_reg[16] = regs->ss & 0xffff; \
} while (0);
+#define ELF_CORE_COPY_REGS(pr_reg, regs) \
+do { \
+ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+ pr_reg[10] = get_user_gs(regs); \
+} while (0);
+
+#define ELF_CORE_COPY_KERNEL_REGS(pr_reg, regs) \
+do { \
+ ELF_CORE_COPY_REGS_COMMON(pr_reg, regs);\
+ savesegment(gs, pr_reg[10]); \
+} while (0);
+
#define ELF_PLATFORM (utsname()->machine)
#define set_personality_64bit() do { } while (0)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4955165..f923203 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -79,7 +79,7 @@
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
- set_user_gs(task_pt_regs(tsk), 0); \
+ lazy_load_gs(0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 6d34d95..e304b66 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -28,7 +28,7 @@
int xds;
int xes;
int xfs;
- /* int gs; */
+ int xgs;
long orig_eax;
long eip;
int xcs;
@@ -50,7 +50,7 @@
unsigned long ds;
unsigned long es;
unsigned long fs;
- /* int gs; */
+ unsigned long gs;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 70c74b8..79b98e5 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -186,10 +186,20 @@
* x86_32 user gs accessors.
*/
#ifdef CONFIG_X86_32
+#ifdef CONFIG_X86_32_LAZY_GS
#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;})
#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v))
#define task_user_gs(tsk) ((tsk)->thread.gs)
-#endif
+#define lazy_save_gs(v) savesegment(gs, (v))
+#define lazy_load_gs(v) loadsegment(gs, (v))
+#else /* X86_32_LAZY_GS */
+#define get_user_gs(regs) (u16)((regs)->gs)
+#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0)
+#define task_user_gs(tsk) (task_pt_regs(tsk)->gs)
+#define lazy_save_gs(v) do { } while (0)
+#define lazy_load_gs(v) do { } while (0)
+#endif /* X86_32_LAZY_GS */
+#endif /* X86_32 */
static inline unsigned long get_limit(unsigned long segment)
{
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index ee4df08..fbf2f33 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -75,6 +75,7 @@
OFFSET(PT_DS, pt_regs, ds);
OFFSET(PT_ES, pt_regs, es);
OFFSET(PT_FS, pt_regs, fs);
+ OFFSET(PT_GS, pt_regs, gs);
OFFSET(PT_ORIG_EAX, pt_regs, orig_ax);
OFFSET(PT_EIP, pt_regs, ip);
OFFSET(PT_CS, pt_regs, cs);
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c461925..82e6868 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -30,12 +30,13 @@
* 1C(%esp) - %ds
* 20(%esp) - %es
* 24(%esp) - %fs
- * 28(%esp) - orig_eax
- * 2C(%esp) - %eip
- * 30(%esp) - %cs
- * 34(%esp) - %eflags
- * 38(%esp) - %oldesp
- * 3C(%esp) - %oldss
+ * 28(%esp) - %gs saved iff !CONFIG_X86_32_LAZY_GS
+ * 2C(%esp) - orig_eax
+ * 30(%esp) - %eip
+ * 34(%esp) - %cs
+ * 38(%esp) - %eflags
+ * 3C(%esp) - %oldesp
+ * 40(%esp) - %oldss
*
* "current" is in register %ebx during any slow entries.
*/
@@ -101,8 +102,99 @@
#define resume_userspace_sig resume_userspace
#endif
+/*
+ * User gs save/restore
+ *
+ * %gs is used for userland TLS and kernel only uses it for stack
+ * canary which is required to be at %gs:20 by gcc. Read the comment
+ * at the top of stackprotector.h for more info.
+ *
+ * Local labels 98 and 99 are used.
+ */
+#ifdef CONFIG_X86_32_LAZY_GS
+
+ /* unfortunately push/pop can't be no-op */
+.macro PUSH_GS
+ pushl $0
+ CFI_ADJUST_CFA_OFFSET 4
+.endm
+.macro POP_GS pop=0
+ addl $(4 + \pop), %esp
+ CFI_ADJUST_CFA_OFFSET -(4 + \pop)
+.endm
+.macro POP_GS_EX
+.endm
+
+ /* all the rest are no-op */
+.macro PTGS_TO_GS
+.endm
+.macro PTGS_TO_GS_EX
+.endm
+.macro GS_TO_REG reg
+.endm
+.macro REG_TO_PTGS reg
+.endm
+.macro SET_KERNEL_GS reg
+.endm
+
+#else /* CONFIG_X86_32_LAZY_GS */
+
+.macro PUSH_GS
+ pushl %gs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET gs, 0*/
+.endm
+
+.macro POP_GS pop=0
+98: popl %gs
+ CFI_ADJUST_CFA_OFFSET -4
+ /*CFI_RESTORE gs*/
+ .if \pop <> 0
+ add $\pop, %esp
+ CFI_ADJUST_CFA_OFFSET -\pop
+ .endif
+.endm
+.macro POP_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, (%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro PTGS_TO_GS
+98: mov PT_GS(%esp), %gs
+.endm
+.macro PTGS_TO_GS_EX
+.pushsection .fixup, "ax"
+99: movl $0, PT_GS(%esp)
+ jmp 98b
+.section __ex_table, "a"
+ .align 4
+ .long 98b, 99b
+.popsection
+.endm
+
+.macro GS_TO_REG reg
+ movl %gs, \reg
+ /*CFI_REGISTER gs, \reg*/
+.endm
+.macro REG_TO_PTGS reg
+ movl \reg, PT_GS(%esp)
+ /*CFI_REL_OFFSET gs, PT_GS*/
+.endm
+.macro SET_KERNEL_GS reg
+ xorl \reg, \reg
+ movl \reg, %gs
+.endm
+
+#endif /* CONFIG_X86_32_LAZY_GS */
+
.macro SAVE_ALL
cld
+ PUSH_GS
pushl %fs
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET fs, 0;*/
@@ -138,6 +230,7 @@
movl %edx, %es
movl $(__KERNEL_PERCPU), %edx
movl %edx, %fs
+ SET_KERNEL_GS %edx
.endm
.macro RESTORE_INT_REGS
@@ -164,7 +257,7 @@
CFI_RESTORE eax
.endm
-.macro RESTORE_REGS
+.macro RESTORE_REGS pop=0
RESTORE_INT_REGS
1: popl %ds
CFI_ADJUST_CFA_OFFSET -4
@@ -175,6 +268,7 @@
3: popl %fs
CFI_ADJUST_CFA_OFFSET -4
/*CFI_RESTORE fs;*/
+ POP_GS \pop
.pushsection .fixup, "ax"
4: movl $0, (%esp)
jmp 1b
@@ -188,6 +282,7 @@
.long 2b, 5b
.long 3b, 6b
.popsection
+ POP_GS_EX
.endm
.macro RING0_INT_FRAME
@@ -368,6 +463,7 @@
xorl %ebp,%ebp
TRACE_IRQS_ON
1: mov PT_FS(%esp), %fs
+ PTGS_TO_GS
ENABLE_INTERRUPTS_SYSEXIT
#ifdef CONFIG_AUDITSYSCALL
@@ -416,6 +512,7 @@
.align 4
.long 1b,2b
.popsection
+ PTGS_TO_GS_EX
ENDPROC(ia32_sysenter_target)
# system call handler stub
@@ -458,8 +555,7 @@
restore_nocheck:
TRACE_IRQS_IRET
restore_nocheck_notrace:
- RESTORE_REGS
- addl $4, %esp # skip orig_eax/error_code
+ RESTORE_REGS 4 # skip orig_eax/error_code
CFI_ADJUST_CFA_OFFSET -4
irq_return:
INTERRUPT_RETURN
@@ -1078,7 +1174,10 @@
CFI_ADJUST_CFA_OFFSET 4
ALIGN
error_code:
- /* the function address is in %fs's slot on the stack */
+ /* the function address is in %gs's slot on the stack */
+ pushl %fs
+ CFI_ADJUST_CFA_OFFSET 4
+ /*CFI_REL_OFFSET fs, 0*/
pushl %es
CFI_ADJUST_CFA_OFFSET 4
/*CFI_REL_OFFSET es, 0*/
@@ -1107,20 +1206,15 @@
CFI_ADJUST_CFA_OFFSET 4
CFI_REL_OFFSET ebx, 0
cld
- pushl %fs
- CFI_ADJUST_CFA_OFFSET 4
- /*CFI_REL_OFFSET fs, 0*/
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
UNWIND_ESPFIX_STACK
- popl %ecx
- CFI_ADJUST_CFA_OFFSET -4
- /*CFI_REGISTER es, ecx*/
- movl PT_FS(%esp), %edi # get the function address
+ GS_TO_REG %ecx
+ movl PT_GS(%esp), %edi # get the function address
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- mov %ecx, PT_FS(%esp)
- /*CFI_REL_OFFSET fs, ES*/
+ REG_TO_PTGS %ecx
+ SET_KERNEL_GS %ecx
movl $(__USER_DS), %ecx
movl %ecx, %ds
movl %ecx, %es
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index d58a340..86122fa 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -539,7 +539,7 @@
* used %fs or %gs (it does not today), or if the kernel is
* running inside of a hypervisor layer.
*/
- savesegment(gs, prev->gs);
+ lazy_save_gs(prev->gs);
/*
* Load the per-thread Thread-Local Storage descriptor.
@@ -585,7 +585,7 @@
* Restore %gs if needed (which is common)
*/
if (prev->gs | next->gs)
- loadsegment(gs, next->gs);
+ lazy_load_gs(next->gs);
percpu_write(current_task, next_p);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 508b6b5..7ec39ab 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -75,10 +75,7 @@
static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
{
BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
- regno >>= 2;
- if (regno > FS)
- --regno;
- return ®s->bx + regno;
+ return ®s->bx + (regno >> 2);
}
static u16 get_segment_reg(struct task_struct *task, unsigned long offset)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 19e33b6..da2e314 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -283,7 +283,7 @@
/* There's one problem which normal hardware doesn't have: the Host
* can't handle us removing entries we're currently using. So we clear
* the GS register here: if it's needed it'll be reloaded anyway. */
- loadsegment(gs, 0);
+ lazy_load_gs(0);
lazy_hcall(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu, 0);
}
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 3723034..95ff6a0 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -323,13 +323,14 @@
static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
{
/*
- * XXX sleazy hack: If we're being called in a lazy-cpu zone,
- * it means we're in a context switch, and %gs has just been
- * saved. This means we can zero it out to prevent faults on
- * exit from the hypervisor if the next process has no %gs.
- * Either way, it has been saved, and the new value will get
- * loaded properly. This will go away as soon as Xen has been
- * modified to not save/restore %gs for normal hypercalls.
+ * XXX sleazy hack: If we're being called in a lazy-cpu zone
+ * and lazy gs handling is enabled, it means we're in a
+ * context switch, and %gs has just been saved. This means we
+ * can zero it out to prevent faults on exit from the
+ * hypervisor if the next process has no %gs. Either way, it
+ * has been saved, and the new value will get loaded properly.
+ * This will go away as soon as Xen has been modified to not
+ * save/restore %gs for normal hypercalls.
*
* On x86_64, this hack is not used for %gs, because gs points
* to KERNEL_GS_BASE (and uses it for PDA references), so we
@@ -341,7 +342,7 @@
*/
if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
#ifdef CONFIG_X86_32
- loadsegment(gs, 0);
+ lazy_load_gs(0);
#else
loadsegment(fs, 0);
#endif