/*
 *  linux/arch/i386/mm/pgtable.c
 */

#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/nmi.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/module.h>
#include <linux/quicklist.h>

#include <asm/system.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>

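/*
 * Dump a summary of memory state to the console.  As far as I can tell
 * this is reached from debug paths such as the SysRq 'm' handler and the
 * OOM killer rather than from ordinary code, which is why the node scan
 * below pokes the NMI watchdog every MAX_ORDER_NR_PAGES pages: walking a
 * large node can otherwise take long enough to look like a lockup.
 */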
void show_mem(void)
{
        int total = 0, reserved = 0;
        int shared = 0, cached = 0;
        int highmem = 0;
        struct page *page;
        pg_data_t *pgdat;
        unsigned long i;
        unsigned long flags;

        printk(KERN_INFO "Mem-info:\n");
        show_free_areas();
        printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
        for_each_online_pgdat(pgdat) {
                pgdat_resize_lock(pgdat, &flags);
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
                        if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
                                touch_nmi_watchdog();
                        page = pgdat_page_nr(pgdat, i);
                        total++;
                        if (PageHighMem(page))
                                highmem++;
                        if (PageReserved(page))
                                reserved++;
                        else if (PageSwapCache(page))
                                cached++;
                        else if (page_count(page))
                                shared += page_count(page) - 1;
                }
                pgdat_resize_unlock(pgdat, &flags);
        }
        printk(KERN_INFO "%d pages of RAM\n", total);
        printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
        printk(KERN_INFO "%d reserved pages\n", reserved);
        printk(KERN_INFO "%d pages shared\n", shared);
        printk(KERN_INFO "%d pages swap cached\n", cached);

        printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
        printk(KERN_INFO "%lu pages writeback\n",
               global_page_state(NR_WRITEBACK));
        printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
        printk(KERN_INFO "%lu pages slab\n",
               global_page_state(NR_SLAB_RECLAIMABLE) +
               global_page_state(NR_SLAB_UNRECLAIMABLE));
        printk(KERN_INFO "%lu pages pagetables\n",
               global_page_state(NR_PAGETABLE));
}

/*
 * Associate a virtual page frame with a given physical page frame
 * and protection flags for that frame.
 */
static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;
        pte_t *pte;

        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                BUG();
                return;
        }
        pud = pud_offset(pgd, vaddr);
        if (pud_none(*pud)) {
                BUG();
                return;
        }
        pmd = pmd_offset(pud, vaddr);
        if (pmd_none(*pmd)) {
                BUG();
                return;
        }
        pte = pte_offset_kernel(pmd, vaddr);
        if (pgprot_val(flags))
                set_pte_present(&init_mm, vaddr, pte, pfn_pte(pfn, flags));
        else
                pte_clear(&init_mm, vaddr, pte);

        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
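
/*
 * The usual way into set_pte_pfn() is through the fixmap helpers further
 * down: assuming the standard fixmap.h wrappers, set_fixmap(idx, phys)
 * expands to __set_fixmap(idx, phys, PAGE_KERNEL), which lands here with
 * the fixed virtual address for idx and phys >> PAGE_SHIFT as the pfn.
 */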

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame. pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated. Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
        pgd_t *pgd;
        pud_t *pud;
        pmd_t *pmd;

        if (vaddr & (PMD_SIZE-1)) {             /* vaddr is misaligned */
                printk(KERN_WARNING "set_pmd_pfn: vaddr misaligned\n");
                return; /* BUG(); */
        }
        if (pfn & (PTRS_PER_PTE-1)) {           /* pfn is misaligned */
                printk(KERN_WARNING "set_pmd_pfn: pfn misaligned\n");
                return; /* BUG(); */
        }
        pgd = swapper_pg_dir + pgd_index(vaddr);
        if (pgd_none(*pgd)) {
                printk(KERN_WARNING "set_pmd_pfn: pgd_none\n");
                return; /* BUG(); */
        }
        pud = pud_offset(pgd, vaddr);
        pmd = pmd_offset(pud, vaddr);
        set_pmd(pmd, pfn_pmd(pfn, flags));
        /*
         * It's enough to flush this one mapping.
         * (PGE mappings get flushed as well)
         */
        __flush_tlb_one(vaddr);
}
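
/*
 * For scale: with PAE, PMD_SIZE is 2MB and PTRS_PER_PTE is 512, so the
 * checks above demand a 2MB-aligned vaddr and a 512-page-aligned pfn.
 * A purely illustrative call (flag choice assumed, not taken from this
 * file) mapping the first 2MB of physical memory would be
 *
 *      set_pmd_pfn(vaddr, 0, __pgprot(_KERNPG_TABLE | _PAGE_PSE));
 *
 * since the entry needs _PAGE_PSE set for the CPU to treat it as a
 * large page.
 */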

static int fixmaps;
unsigned long __FIXADDR_TOP = 0xfffff000;
EXPORT_SYMBOL(__FIXADDR_TOP);

void __set_fixmap(enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
        unsigned long address = __fix_to_virt(idx);

        if (idx >= __end_of_fixed_addresses) {
                BUG();
                return;
        }
        set_pte_pfn(address, phys >> PAGE_SHIFT, flags);
        fixmaps++;
}

/**
 * reserve_top_address - reserves a hole in the top of kernel address space
 * @reserve: size of hole to reserve
 *
 * Can be used to relocate the fixmap area and poke a hole in the top
 * of kernel address space to make room for a hypervisor.
 */
void reserve_top_address(unsigned long reserve)
{
        BUG_ON(fixmaps > 0);
        printk(KERN_INFO "Reserving virtual address space above 0x%08x\n",
               (int)-reserve);
        __FIXADDR_TOP = -reserve - PAGE_SIZE;
        __VMALLOC_RESERVE += reserve;
}
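
/*
 * A sketch of the intended use (hypothetical caller, not from this file):
 * a hypervisor probe wanting, say, 64MB at the very top of the address
 * space would call
 *
 *      reserve_top_address(64 << 20);
 *
 * early in boot, before the first __set_fixmap() (hence the BUG_ON above),
 * which moves __FIXADDR_TOP and everything below it down past the hole.
 */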

pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
{
        return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
}

struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
{
        struct page *pte;

#ifdef CONFIG_HIGHPTE
        pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT|__GFP_ZERO, 0);
#else
        pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0);
#endif
        return pte;
}
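
/*
 * With CONFIG_HIGHPTE the user pte pages handed out above may live in
 * highmem and so have no permanent kernel mapping; as far as I can tell
 * that is why generic code reaches them via pte_offset_map()/pte_unmap()
 * (kmap_atomic underneath) rather than plain pointers, while
 * pte_alloc_one_kernel() stays in lowmem so pte_offset_kernel() works.
 */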

/*
 * List of all pgd's needed for non-PAE so it can invalidate entries
 * in both cached and uncached pgd's; not needed for PAE since the
 * kernel pmd is shared. If PAE were not to share the pmd a similar
 * tactic would be needed. This is essentially codepath-based locking
 * against pageattr.c; it is the unique case in which a valid change
 * of kernel pagetables can't be lazily synchronized by vmalloc faults.
 * vmalloc faults work because attached pagetables are never freed.
 * -- wli
 */
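
/*
 * The list is threaded through page->lru of the pages backing each pgd;
 * a walker (pageattr.c, for instance) can be expected to do roughly
 *
 *      list_for_each_entry(page, &pgd_list, lru) {
 *              pgd_t *pgd = (pgd_t *)page_address(page);
 *              ...
 *      }
 *
 * under pgd_lock.  A sketch only; the real walker lives elsewhere.
 */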
static inline void pgd_list_add(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_add(&page->lru, &pgd_list);
}

static inline void pgd_list_del(pgd_t *pgd)
{
        struct page *page = virt_to_page(pgd);

        list_del(&page->lru);
}

#if (PTRS_PER_PMD == 1)
/* Non-PAE pgd constructor */
static void pgd_ctor(void *pgd)
{
        unsigned long flags;

        /* !PAE, no pagetable sharing */
        memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));

        spin_lock_irqsave(&pgd_lock, flags);

        /* must happen under lock */
        clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
                        swapper_pg_dir + USER_PTRS_PER_PGD,
                        KERNEL_PGD_PTRS);
        paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT,
                                __pa(swapper_pg_dir) >> PAGE_SHIFT,
                                USER_PTRS_PER_PGD,
                                KERNEL_PGD_PTRS);
        pgd_list_add(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}
#else  /* PTRS_PER_PMD > 1 */
/* PAE pgd constructor */
static void pgd_ctor(void *pgd)
{
        /* PAE, kernel PMD may be shared */

        if (SHARED_KERNEL_PMD) {
                clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD,
                                swapper_pg_dir + USER_PTRS_PER_PGD,
                                KERNEL_PGD_PTRS);
        } else {
                unsigned long flags;

                memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
                spin_lock_irqsave(&pgd_lock, flags);
                pgd_list_add(pgd);
                spin_unlock_irqrestore(&pgd_lock, flags);
        }
}
#endif  /* PTRS_PER_PMD */

static void pgd_dtor(void *pgd)
{
        unsigned long flags; /* can be called from interrupt context */

        if (SHARED_KERNEL_PMD)
                return;

        spin_lock_irqsave(&pgd_lock, flags);
        pgd_list_del(pgd);
        spin_unlock_irqrestore(&pgd_lock, flags);
}

#define UNSHARED_PTRS_PER_PGD                           \
        (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD)

#ifdef CONFIG_X86_PAE
/*
 * Mop up any pmd pages which may still be attached to the pgd.
 * Normally they will be freed by munmap/exit_mmap, but any pmd we
 * preallocate which never got a corresponding vma will need to be
 * freed manually.
 */
static void pgd_mop_up_pmds(pgd_t *pgdp)
{
        int i;

        for (i = 0; i < UNSHARED_PTRS_PER_PGD; i++) {
                pgd_t pgd = pgdp[i];

                if (pgd_val(pgd) != 0) {
                        pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);

                        pgdp[i] = native_make_pgd(0);

                        paravirt_release_pd(pgd_val(pgd) >> PAGE_SHIFT);
                        pmd_free(pmd);
                }
        }
}

/*
 * In PAE mode, we need to do a cr3 reload (=tlb flush) when
 * updating the top-level pagetable entries to guarantee the
 * processor notices the update.  Since this is expensive, and
 * all 4 top-level entries are used almost immediately in a
 * new process's life, we just pre-populate them here.
 *
 * Also, if we're in a paravirt environment where the kernel pmd is
 * not shared between pagetables (!SHARED_KERNEL_PMD), we allocate
 * and initialize the kernel pmds here.
 */
static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
{
        pud_t *pud;
        unsigned long addr;
        int i;

        pud = pud_offset(pgd, 0);
        for (addr = i = 0; i < UNSHARED_PTRS_PER_PGD;
             i++, pud++, addr += PUD_SIZE) {
                pmd_t *pmd = pmd_alloc_one(mm, addr);

                if (!pmd) {
                        pgd_mop_up_pmds(pgd);
                        return 0;
                }

                if (i >= USER_PTRS_PER_PGD)
                        memcpy(pmd, (pmd_t *)pgd_page_vaddr(swapper_pg_dir[i]),
                               sizeof(pmd_t) * PTRS_PER_PMD);

                pud_populate(mm, pud, pmd);
        }

        return 1;
}
#else  /* !CONFIG_X86_PAE */
/* No need to prepopulate any pagetable entries in non-PAE modes. */
static int pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd)
{
        return 1;
}

static void pgd_mop_up_pmds(pgd_t *pgd)
{
}
#endif  /* CONFIG_X86_PAE */

pgd_t *pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor);

        mm->pgd = pgd;                  /* so that alloc_pd can use it */

        if (pgd && !pgd_prepopulate_pmd(mm, pgd)) {
                quicklist_free(0, pgd_dtor, pgd);
                pgd = NULL;
        }

        return pgd;
}

void pgd_free(pgd_t *pgd)
{
        pgd_mop_up_pmds(pgd);
        quicklist_free(0, pgd_dtor, pgd);
}

void check_pgt_cache(void)
{
        quicklist_trim(0, pgd_dtor, 25, 16);
}
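
/*
 * check_pgt_cache() is the periodic trim hook for the pgd quicklist.
 * Reading quicklist_trim()'s last two arguments as (pages to keep,
 * maximum to free per call), this trims the per-cpu cache back toward
 * 25 pgd pages, releasing at most 16 of the excess each time and
 * running pgd_dtor on every page as it leaves the cache.
 */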

void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
        paravirt_release_pt(page_to_pfn(pte));
        tlb_remove_page(tlb, pte);
}

#ifdef CONFIG_X86_PAE

void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
{
        /*
         * This is called just after the pmd has been detached from
         * the pgd, which requires a full tlb flush to be recognized
         * by the CPU.  Rather than incurring multiple tlb flushes
         * while the address space is being pulled down, make the tlb
         * gathering machinery do a full flush when we're done.
         */
        tlb->fullmm = 1;

        paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT);
        tlb_remove_page(tlb, virt_to_page(pmd));
}

#endif