Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 1 | /* |
Dave Jones | 835c34a | 2007-10-12 21:10:53 -0400 | [diff] [blame] | 2 | * handle transition of Linux booting another kernel |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 3 | * Copyright (C) 2002-2005 Eric Biederman <ebiederm@xmission.com> |
| 4 | * |
| 5 | * This source code is licensed under the GNU General Public License, |
| 6 | * Version 2. See the file COPYING for more details. |
| 7 | */ |
| 8 | |
| 9 | #include <linux/mm.h> |
| 10 | #include <linux/kexec.h> |
| 11 | #include <linux/delay.h> |
Rusty Russell | 1a3f239 | 2006-09-26 10:52:32 +0200 | [diff] [blame] | 12 | #include <linux/init.h> |
Ken'ichi Ohmichi | fd59d23 | 2007-10-16 23:27:27 -0700 | [diff] [blame] | 13 | #include <linux/numa.h> |
Ingo Molnar | f43fdad | 2008-05-12 21:20:43 +0200 | [diff] [blame] | 14 | #include <linux/ftrace.h> |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 15 | #include <linux/suspend.h> |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 16 | #include <linux/gfp.h> |
Huang Ying | fef3a7a | 2009-03-10 10:56:57 +0800 | [diff] [blame] | 17 | #include <linux/io.h> |
Ingo Molnar | f43fdad | 2008-05-12 21:20:43 +0200 | [diff] [blame] | 18 | |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 19 | #include <asm/pgtable.h> |
| 20 | #include <asm/pgalloc.h> |
| 21 | #include <asm/tlbflush.h> |
| 22 | #include <asm/mmu_context.h> |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 23 | #include <asm/apic.h> |
| 24 | #include <asm/cpufeature.h> |
Eric W. Biederman | e7b47cc | 2005-07-29 13:01:18 -0600 | [diff] [blame] | 25 | #include <asm/desc.h> |
Zachary Amsden | 4bb0d3e | 2005-09-03 15:56:36 -0700 | [diff] [blame] | 26 | #include <asm/system.h> |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 27 | #include <asm/cacheflush.h> |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 28 | |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 29 | static void set_idt(void *newidt, __u16 limit) |
| 30 | { |
Glauber de Oliveira Costa | 6b68f01 | 2008-01-30 13:31:12 +0100 | [diff] [blame] | 31 | struct desc_ptr curidt; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 32 | |
| 33 | /* ia32 supports unaliged loads & stores */ |
Eric W. Biederman | e7b47cc | 2005-07-29 13:01:18 -0600 | [diff] [blame] | 34 | curidt.size = limit; |
| 35 | curidt.address = (unsigned long)newidt; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 36 | |
Zachary Amsden | f2ab446 | 2005-09-03 15:56:42 -0700 | [diff] [blame] | 37 | load_idt(&curidt); |
WANG Cong | 378fc6e | 2008-06-24 16:21:18 +0100 | [diff] [blame] | 38 | } |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 39 | |
| 40 | |
| 41 | static void set_gdt(void *newgdt, __u16 limit) |
| 42 | { |
Glauber de Oliveira Costa | 6b68f01 | 2008-01-30 13:31:12 +0100 | [diff] [blame] | 43 | struct desc_ptr curgdt; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 44 | |
| 45 | /* ia32 supports unaligned loads & stores */ |
Eric W. Biederman | e7b47cc | 2005-07-29 13:01:18 -0600 | [diff] [blame] | 46 | curgdt.size = limit; |
| 47 | curgdt.address = (unsigned long)newgdt; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 48 | |
Zachary Amsden | f2ab446 | 2005-09-03 15:56:42 -0700 | [diff] [blame] | 49 | load_gdt(&curgdt); |
WANG Cong | 378fc6e | 2008-06-24 16:21:18 +0100 | [diff] [blame] | 50 | } |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 51 | |
/*
 * Force-reload every segment register: %cs is reloaded with __KERNEL_CS
 * through a far jump to the next instruction, then %ds, %es, %fs, %gs
 * and %ss are all loaded with __KERNEL_DS.  %eax is used as scratch.
 */
static void load_segments(void)
{
/* Two-level expansion so the selector macro is expanded before '#'. */
#define KEXEC_SEL_STR_1(sel)	#sel
#define KEXEC_SEL_STR(sel)	KEXEC_SEL_STR_1(sel)

	__asm__ __volatile__ (
		/* code segment */
		"\tljmp $"KEXEC_SEL_STR(__KERNEL_CS)",$1f\n"
		"\t1:\n"
		/* data and stack segments */
		"\tmovl $"KEXEC_SEL_STR(__KERNEL_DS)",%%eax\n"
		"\tmovl %%eax,%%ds\n"
		"\tmovl %%eax,%%es\n"
		"\tmovl %%eax,%%fs\n"
		"\tmovl %%eax,%%gs\n"
		"\tmovl %%eax,%%ss\n"
		: /* no outputs */
		: /* no inputs */
		: "eax", "memory");

#undef KEXEC_SEL_STR
#undef KEXEC_SEL_STR_1
}
| 70 | |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 71 | static void machine_kexec_free_page_tables(struct kimage *image) |
| 72 | { |
| 73 | free_page((unsigned long)image->arch.pgd); |
| 74 | #ifdef CONFIG_X86_PAE |
| 75 | free_page((unsigned long)image->arch.pmd0); |
| 76 | free_page((unsigned long)image->arch.pmd1); |
| 77 | #endif |
| 78 | free_page((unsigned long)image->arch.pte0); |
| 79 | free_page((unsigned long)image->arch.pte1); |
| 80 | } |
| 81 | |
| 82 | static int machine_kexec_alloc_page_tables(struct kimage *image) |
| 83 | { |
| 84 | image->arch.pgd = (pgd_t *)get_zeroed_page(GFP_KERNEL); |
| 85 | #ifdef CONFIG_X86_PAE |
| 86 | image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
| 87 | image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); |
| 88 | #endif |
| 89 | image->arch.pte0 = (pte_t *)get_zeroed_page(GFP_KERNEL); |
| 90 | image->arch.pte1 = (pte_t *)get_zeroed_page(GFP_KERNEL); |
| 91 | if (!image->arch.pgd || |
| 92 | #ifdef CONFIG_X86_PAE |
| 93 | !image->arch.pmd0 || !image->arch.pmd1 || |
| 94 | #endif |
| 95 | !image->arch.pte0 || !image->arch.pte1) { |
| 96 | machine_kexec_free_page_tables(image); |
| 97 | return -ENOMEM; |
| 98 | } |
| 99 | return 0; |
| 100 | } |
| 101 | |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 102 | static void machine_kexec_page_table_set_one( |
| 103 | pgd_t *pgd, pmd_t *pmd, pte_t *pte, |
| 104 | unsigned long vaddr, unsigned long paddr) |
| 105 | { |
| 106 | pud_t *pud; |
| 107 | |
| 108 | pgd += pgd_index(vaddr); |
| 109 | #ifdef CONFIG_X86_PAE |
| 110 | if (!(pgd_val(*pgd) & _PAGE_PRESENT)) |
| 111 | set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); |
| 112 | #endif |
| 113 | pud = pud_offset(pgd, vaddr); |
| 114 | pmd = pmd_offset(pud, vaddr); |
| 115 | if (!(pmd_val(*pmd) & _PAGE_PRESENT)) |
| 116 | set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); |
| 117 | pte = pte_offset_kernel(pmd, vaddr); |
| 118 | set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); |
| 119 | } |
| 120 | |
| 121 | static void machine_kexec_prepare_page_tables(struct kimage *image) |
| 122 | { |
| 123 | void *control_page; |
Hannes Eder | fc6fcdf | 2009-02-22 01:00:57 +0100 | [diff] [blame] | 124 | pmd_t *pmd = NULL; |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 125 | |
| 126 | control_page = page_address(image->control_code_page); |
| 127 | #ifdef CONFIG_X86_PAE |
| 128 | pmd = image->arch.pmd0; |
| 129 | #endif |
| 130 | machine_kexec_page_table_set_one( |
| 131 | image->arch.pgd, pmd, image->arch.pte0, |
| 132 | (unsigned long)control_page, __pa(control_page)); |
| 133 | #ifdef CONFIG_X86_PAE |
| 134 | pmd = image->arch.pmd1; |
| 135 | #endif |
| 136 | machine_kexec_page_table_set_one( |
| 137 | image->arch.pgd, pmd, image->arch.pte1, |
| 138 | __pa(control_page), __pa(control_page)); |
| 139 | } |
| 140 | |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 141 | /* |
| 142 | * A architecture hook called to validate the |
| 143 | * proposed image and prepare the control pages |
Huang Ying | 163f687 | 2008-08-15 00:40:22 -0700 | [diff] [blame] | 144 | * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 145 | * have been allocated, but the segments have yet |
| 146 | * been copied into the kernel. |
| 147 | * |
| 148 | * Do what every setup is needed on image and the |
| 149 | * reboot code buffer to allow us to avoid allocations |
| 150 | * later. |
| 151 | * |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 152 | * - Make control page executable. |
| 153 | * - Allocate page tables |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 154 | * - Setup page tables |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 155 | */ |
| 156 | int machine_kexec_prepare(struct kimage *image) |
| 157 | { |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 158 | int error; |
| 159 | |
H. Peter Anvin | 583140a | 2009-11-13 15:28:15 -0800 | [diff] [blame^] | 160 | set_pages_x(image->control_code_page, 1); |
Huang Ying | 9868ee6 | 2008-10-31 09:48:15 +0800 | [diff] [blame] | 161 | error = machine_kexec_alloc_page_tables(image); |
| 162 | if (error) |
| 163 | return error; |
| 164 | machine_kexec_prepare_page_tables(image); |
| 165 | return 0; |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 166 | } |
| 167 | |
| 168 | /* |
| 169 | * Undo anything leftover by machine_kexec_prepare |
| 170 | * when an image is freed. |
| 171 | */ |
| 172 | void machine_kexec_cleanup(struct kimage *image) |
| 173 | { |
H. Peter Anvin | 583140a | 2009-11-13 15:28:15 -0800 | [diff] [blame^] | 174 | set_pages_nx(image->control_code_page, 1); |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 175 | machine_kexec_free_page_tables(image); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 176 | } |
| 177 | |
| 178 | /* |
| 179 | * Do not allocate memory (or fail in any way) in machine_kexec(). |
| 180 | * We are past the point of no return, committed to rebooting now. |
| 181 | */ |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 182 | void machine_kexec(struct kimage *image) |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 183 | { |
Magnus Damm | 3566561 | 2006-09-26 10:52:38 +0200 | [diff] [blame] | 184 | unsigned long page_list[PAGES_NR]; |
| 185 | void *control_page; |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 186 | int save_ftrace_enabled; |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 187 | asmlinkage unsigned long |
| 188 | (*relocate_kernel_ptr)(unsigned long indirection_page, |
| 189 | unsigned long control_page, |
| 190 | unsigned long start_address, |
| 191 | unsigned int has_pae, |
| 192 | unsigned int preserve_context); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 193 | |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 194 | #ifdef CONFIG_KEXEC_JUMP |
Huang Ying | 6407df5 | 2009-05-08 10:51:41 +0800 | [diff] [blame] | 195 | if (image->preserve_context) |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 196 | save_processor_state(); |
| 197 | #endif |
| 198 | |
| 199 | save_ftrace_enabled = __ftrace_enabled_save(); |
Ingo Molnar | f43fdad | 2008-05-12 21:20:43 +0200 | [diff] [blame] | 200 | |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 201 | /* Interrupts aren't acceptable while we reboot */ |
| 202 | local_irq_disable(); |
| 203 | |
Huang Ying | 89081d1 | 2008-07-25 19:45:10 -0700 | [diff] [blame] | 204 | if (image->preserve_context) { |
| 205 | #ifdef CONFIG_X86_IO_APIC |
Huang Ying | fef3a7a | 2009-03-10 10:56:57 +0800 | [diff] [blame] | 206 | /* |
| 207 | * We need to put APICs in legacy mode so that we can |
Huang Ying | 89081d1 | 2008-07-25 19:45:10 -0700 | [diff] [blame] | 208 | * get timer interrupts in second kernel. kexec/kdump |
| 209 | * paths already have calls to disable_IO_APIC() in |
| 210 | * one form or other. kexec jump path also need |
| 211 | * one. |
| 212 | */ |
| 213 | disable_IO_APIC(); |
| 214 | #endif |
| 215 | } |
| 216 | |
Magnus Damm | 3566561 | 2006-09-26 10:52:38 +0200 | [diff] [blame] | 217 | control_page = page_address(image->control_code_page); |
Huang Ying | fb45daa | 2008-08-15 00:40:23 -0700 | [diff] [blame] | 218 | memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 219 | |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 220 | relocate_kernel_ptr = control_page; |
Magnus Damm | 3566561 | 2006-09-26 10:52:38 +0200 | [diff] [blame] | 221 | page_list[PA_CONTROL_PAGE] = __pa(control_page); |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 222 | page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; |
Huang Ying | 92be3d6 | 2008-10-31 09:48:08 +0800 | [diff] [blame] | 223 | page_list[PA_PGD] = __pa(image->arch.pgd); |
Ken'ichi Ohmichi | e7706fc | 2008-10-20 13:51:52 +0900 | [diff] [blame] | 224 | |
| 225 | if (image->type == KEXEC_TYPE_DEFAULT) |
| 226 | page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) |
| 227 | << PAGE_SHIFT); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 228 | |
Huang Ying | fef3a7a | 2009-03-10 10:56:57 +0800 | [diff] [blame] | 229 | /* |
| 230 | * The segment registers are funny things, they have both a |
Eric W. Biederman | 2a8a3d5 | 2006-07-30 03:03:20 -0700 | [diff] [blame] | 231 | * visible and an invisible part. Whenever the visible part is |
| 232 | * set to a specific selector, the invisible part is loaded |
| 233 | * with from a table in memory. At no other time is the |
| 234 | * descriptor table in memory accessed. |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 235 | * |
| 236 | * I take advantage of this here by force loading the |
| 237 | * segments, before I zap the gdt with an invalid value. |
| 238 | */ |
| 239 | load_segments(); |
Huang Ying | fef3a7a | 2009-03-10 10:56:57 +0800 | [diff] [blame] | 240 | /* |
| 241 | * The gdt & idt are now invalid. |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 242 | * If you want to load them you must set up your own idt & gdt. |
| 243 | */ |
Huang Ying | fef3a7a | 2009-03-10 10:56:57 +0800 | [diff] [blame] | 244 | set_gdt(phys_to_virt(0), 0); |
| 245 | set_idt(phys_to_virt(0), 0); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 246 | |
| 247 | /* now call it */ |
Huang Ying | 3ab8352 | 2008-07-25 19:45:07 -0700 | [diff] [blame] | 248 | image->start = relocate_kernel_ptr((unsigned long)image->head, |
| 249 | (unsigned long)page_list, |
| 250 | image->start, cpu_has_pae, |
| 251 | image->preserve_context); |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 252 | |
| 253 | #ifdef CONFIG_KEXEC_JUMP |
Huang Ying | 6407df5 | 2009-05-08 10:51:41 +0800 | [diff] [blame] | 254 | if (image->preserve_context) |
Huang Ying | 3122c33 | 2008-08-15 00:40:26 -0700 | [diff] [blame] | 255 | restore_processor_state(); |
| 256 | #endif |
| 257 | |
| 258 | __ftrace_enabled_restore(save_ftrace_enabled); |
Eric W. Biederman | 5033cba | 2005-06-25 14:57:56 -0700 | [diff] [blame] | 259 | } |
Rusty Russell | 1a3f239 | 2006-09-26 10:52:32 +0200 | [diff] [blame] | 260 | |
/*
 * Record the architecture-specific vmcoreinfo entries: the node_data
 * symbol and its length (MAX_NUMNODES) on NUMA builds, and the X86_PAE
 * config marker on PAE builds.
 */
void arch_crash_save_vmcoreinfo(void)
{
#if defined(CONFIG_NUMA)
	VMCOREINFO_SYMBOL(node_data);
	VMCOREINFO_LENGTH(node_data, MAX_NUMNODES);
#endif

#if defined(CONFIG_X86_PAE)
	VMCOREINFO_CONFIG(X86_PAE);
#endif
}
| 271 | |