/*
 *  linux/arch/i386/mm/pgtable.c
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

void show_mem(void)
{
	int total = 0, reserved = 0;
	int shared = 0, cached = 0;
	int highmem = 0;
	struct page *page;
	pg_data_t *pgdat;
	unsigned long i;
	unsigned long flags;

	printk(KERN_INFO "Mem-info:\n");
	show_free_areas();
	printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
	for_each_online_pgdat(pgdat) {
		pgdat_resize_lock(pgdat, &flags);
		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
			if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
				touch_nmi_watchdog();
			page = pgdat_page_nr(pgdat, i);
			total++;
			if (PageHighMem(page))
				highmem++;
			if (PageReserved(page))
				reserved++;
			else if (PageSwapCache(page))
				cached++;
			else if (page_count(page))
				shared += page_count(page) - 1;
		}
		pgdat_resize_unlock(pgdat, &flags);
	}
	printk(KERN_INFO "%d pages of RAM\n", total);
	printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
	printk(KERN_INFO "%d reserved pages\n", reserved);
	printk(KERN_INFO "%d pages shared\n", shared);
	printk(KERN_INFO "%d pages swap cached\n", cached);

	printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
	printk(KERN_INFO "%lu pages writeback\n",
					global_page_state(NR_WRITEBACK));
	printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
	printk(KERN_INFO "%lu pages slab\n",
		global_page_state(NR_SLAB_RECLAIMABLE) +
		global_page_state(NR_SLAB_UNRECLAIMABLE));
	printk(KERN_INFO "%lu pages pagetables\n",
					global_page_state(NR_PAGETABLE));
}
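
/*
 * Example of the output produced by show_mem() above (illustrative only;
 * the numbers are made up and will differ on any real system):
 *
 *   Mem-info:
 *   Free swap: 524280kB
 *   131072 pages of RAM
 *   0 pages of HIGHMEM
 *   3620 reserved pages
 *   ...
 */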

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		BUG();
		return;
	}
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		BUG();
		return;
	}
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		BUG();
		return;
	}
	pte = pte_offset_kernel(pmd, vaddr);
	if (pgprot_val(flags))
		set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags));
	else
		pte_clear(&init_mm, vaddr, pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
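
/*
 * Usage sketch (illustrative, not a real call site): set_pte_pfn() is
 * reached through __set_fixmap() below, which wires one fixmap slot to a
 * physical page, roughly:
 *
 *	set_pte_pfn(fix_to_virt(FIX_APIC_BASE), mp_lapic_addr >> PAGE_SHIFT,
 *		    PAGE_KERNEL_NOCACHE);
 *
 * The FIX_APIC_BASE/mp_lapic_addr pair is only an assumed example of a
 * caller; any pfn/protection combination whose virtual address is already
 * covered by the kernel page tables works the same way.
 */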

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame. pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated. Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
		return; /* BUG(); */
	}
	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
		printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
		return; /* BUG(); */
	}
	pgd = swapper_pg_dir + pgd_index(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
		return; /* BUG(); */
	}
	pud = pud_offset(pgd, vaddr);
	pmd = pmd_offset(pud, vaddr);
	set_pmd(pmd, pfn_pmd(pfn, flags));
	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}
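
/*
 * Usage sketch (illustrative only): map one large page worth of physical
 * memory at a PMD-aligned virtual address, assuming PAE so that PMD_SIZE
 * is 2MB and PTRS_PER_PTE is 512:
 *
 *	set_pmd_pfn(vaddr, phys >> PAGE_SHIFT, PAGE_KERNEL_LARGE);
 *
 * Both vaddr and the pfn must be aligned to the large-page size, and the
 * covering pgd entry must already exist, exactly as the checks above
 * enforce.
 */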

static int fixmaps;
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);

void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);

	if (idx >= __end_of_fixed_addresses) {
		BUG();
		return;
	}
	set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
	fixmaps++;
}

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve - size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void reserve_top_address(unsigned long reserve)
{
	BUG_ON(fixmaps > 0);
	printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
	       (int)-reserve);
	__FIXADDR_TOP = -reserve - PAGE_SIZE;
	__VMALLOC_RESERVE += reserve;
}
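
/*
 * Usage sketch: a paravirtualized guest that needs the top of the address
 * space for its hypervisor would call this very early in boot, before any
 * fixmap entry has been installed, e.g. (hypothetical size):
 *
 *	reserve_top_address(64 * 1024 * 1024);
 *
 * The BUG_ON(fixmaps > 0) above guards against relocating the fixmap
 * after __set_fixmap() has already placed entries at the old location.
 */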

pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
	return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
	struct page *pte;

#ifdef CONFIG_HIGHPTE
	pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
	pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
	return pte;
}
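
/*
 * Note on CONFIG_HIGHPTE: with it enabled the pte page returned above may
 * live in highmem and has no permanent kernel mapping, so users must go
 * through kmap_atomic() (as pte_offset_map() does) rather than keeping a
 * long-lived pointer into the page.  A simplified sketch of what a page
 * table walker has to do:
 *
 *	pte_t *pte = pte_offset_map(pmd, address);
 *	entry = *pte;
 *	pte_unmap(pte);
 */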

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- wli
 */
static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);

	list_add(&page->lru, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);

	list_del(&page->lru);
}

#define UNSHARED_PTRS_PER_PGD				\
	(SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)

static void pgd_ctor(void *p)
{
	pgd_t *pgd = p;
	unsigned long flags;

	/* Clear usermode parts of PGD */
	memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));

	spin_lock_irqsave(&pgd_lock, flags);

	/* If the pgd points to a shared pagetable level (either the
	   ptes in non-PAE, or shared PMD in PAE), then just copy the
	   references from swapper_pg_dir. */
	if (PAGETABLE_LEVELS == 2 ||
	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD)) {
		clone_pgd_range(pgd + USER_PTRS_PER_PGD,
				swapper_pg_dir + USER_PTRS_PER_PGD,
				KERNEL_PGD_PTRS);
		paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
					__pa(swapper_pg_dir) >> PAGE_SHIFT,
					USER_PTRS_PER_PGD,
					KERNEL_PGD_PTRS);
	}

	/* list required to sync kernel mapping updates */
	if (!SHARED_KERNEL_PMD)
		pgd_list_add(pgd);

	spin_unlock_irqrestore(&pgd_lock, flags);
}
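
/*
 * clone_pgd_range() above is essentially a memcpy() of pgd entries, so a
 * freshly constructed pgd has its kernel half pointing at the same
 * lower-level tables as swapper_pg_dir.  Later changes to the kernel
 * mapping therefore only need explicit propagation when the kernel pmd is
 * not shared, which is what the pgd_list kept above is for.
 */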

static void pgd_dtor(void *pgd)
{
	unsigned long flags; /* can be called from interrupt context */

	if (SHARED_KERNEL_PMD)
		return;

	spin_lock_irqsave(&pgd_lock, flags);
	pgd_list_del(pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);
}

#ifdef CONFIG_X86_PAE
/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void pgd_mop_up_pmds(pgd_t *pgdp)
{
	int i;

	for (i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
		pgd_t pgd = pgdp[i];

		if (pgd_val(pgd) != 0) {
			pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

			pgdp[i] = native_make_pgd(0);

			paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
			pmd_free(pmd);
		}
	}
}

/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update. Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
{
	pud_t *pud;
	unsigned long addr;
	int i;

	pud = pud_offset(pgd, 0);
	for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
	     i++, pud++, addr += PUD_SIZE) {
		pmd_t *pmd = pmd_alloc_one(mm, addr);

		if (!pmd) {
			pgd_mop_up_pmds(pgd);
			return 0;
		}

		if (i >= USER_PTRS_PER_PGD)
			memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
			       sizeof(pmd_t) * PTRS_PER_PMD);

		pud_populate(mm, pud, pmd);
	}

	return 1;
}
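
/*
 * With PAE the pgd holds only 4 entries, each covering 1GB, so the loop
 * above allocates at most 4 pmd pages per process.  For the kernel part
 * (i >= USER_PTRS_PER_PGD) the freshly allocated pmd starts life as a
 * copy of the corresponding swapper_pg_dir pmd, so kernel mappings are
 * visible immediately without waiting for a vmalloc fault.
 */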
#else	/* !CONFIG_X86_PAE */
/* No need to prepopulate any pagetable entries in non-PAE modes. */
static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
{
	return 1;
}

static void pgd_mop_up_pmds(pgd_t *pgd)
{
}
#endif	/* CONFIG_X86_PAE */

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);

	mm->pgd = pgd;			/* so that alloc_pd can use it */

	if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
		quicklist_free(0, pgd_dtor, pgd);
		pgd = NULL;
	}

	return pgd;
}
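
/*
 * Typical call path (sketch, assuming the fork-time mm setup of this
 * kernel generation): the mm allocation code does roughly
 *
 *	mm->pgd = pgd_alloc(mm);
 *	if (!mm->pgd)
 *		return -ENOMEM;
 *
 * Note that pgd_alloc() already stores the pgd in mm->pgd itself (see
 * above) so the paravirt allocation hooks can find it; the caller's
 * assignment mainly records the final (possibly NULL) result.
 */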

void pgd_free(pgd_t *pgd)
{
	pgd_mop_up_pmds(pgd);
	quicklist_free(0, pgd_dtor, pgd);
}

void check_pgt_cache(void)
{
	quicklist_trim(0, pgd_dtor, 25, 16);
}

void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
	paravirt_release_pt(page_to_pfn(pte));
	tlb_remove_page(tlb, pte);
}

#ifdef CONFIG_X86_PAE

void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
	/* This is called just after the pmd has been detached from
	   the pgd, which requires a full tlb flush to be recognized
	   by the CPU. Rather than incurring multiple tlb flushes
	   while the address space is being pulled down, make the tlb
	   gathering machinery do a full flush when we're done. */
	tlb->fullmm = 1;

	paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
	tlb_remove_page(tlb, virt_to_page(pmd));
}

#endif