blob: 37a23c22370576415441e7542728b79a233f2f58 [file] [log] [blame]
/*
 *    Copyright IBM Corp. 2007,2009
 *    Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
 */
5
6#include <linux/sched.h>
7#include <linux/kernel.h>
8#include <linux/errno.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +09009#include <linux/gfp.h>
Martin Schwidefsky3610cce2007-10-22 12:52:47 +020010#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/smp.h>
13#include <linux/highmem.h>
Martin Schwidefsky3610cce2007-10-22 12:52:47 +020014#include <linux/pagemap.h>
15#include <linux/spinlock.h>
16#include <linux/module.h>
17#include <linux/quicklist.h>
Martin Schwidefsky80217142010-10-25 16:10:11 +020018#include <linux/rcupdate.h>
Martin Schwidefsky3610cce2007-10-22 12:52:47 +020019
20#include <asm/system.h>
21#include <asm/pgtable.h>
22#include <asm/pgalloc.h>
23#include <asm/tlb.h>
24#include <asm/tlbflush.h>
Martin Schwidefsky6252d702008-02-09 18:24:37 +010025#include <asm/mmu_context.h>
Martin Schwidefsky3610cce2007-10-22 12:52:47 +020026
/*
 * ALLOC_ORDER: page order passed to alloc_pages()/free_pages() for the
 *		region and segment (crst) tables.
 * FRAG_MASK:	one bit per page table fragment of a 4K page; four bits on
 *		31-bit kernels, two bits on 64-bit kernels.
 */
#ifdef CONFIG_64BIT
#define ALLOC_ORDER	2
#define FRAG_MASK	0x03
#else
#define ALLOC_ORDER	1
#define FRAG_MASK	0x0f
#endif
34
Heiko Carstens239a64252009-06-12 10:26:33 +020035unsigned long VMALLOC_START = VMALLOC_END - VMALLOC_SIZE;
36EXPORT_SYMBOL(VMALLOC_START);
37
38static int __init parse_vmalloc(char *arg)
39{
40 if (!arg)
41 return -EINVAL;
42 VMALLOC_START = (VMALLOC_END - memparse(arg, &arg)) & PAGE_MASK;
43 return 0;
44}
45early_param("vmalloc", parse_vmalloc);
46
Martin Schwidefsky043d0702011-05-23 10:24:23 +020047unsigned long *crst_table_alloc(struct mm_struct *mm)
Martin Schwidefsky3610cce2007-10-22 12:52:47 +020048{
49 struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
50
51 if (!page)
52 return NULL;
Martin Schwidefsky3610cce2007-10-22 12:52:47 +020053 return (unsigned long *) page_to_phys(page);
54}
55
Martin Schwidefsky146e4b32008-02-09 18:24:35 +010056void crst_table_free(struct mm_struct *mm, unsigned long *table)
Martin Schwidefsky3610cce2007-10-22 12:52:47 +020057{
Martin Schwidefsky043d0702011-05-23 10:24:23 +020058 free_pages((unsigned long) table, ALLOC_ORDER);
Martin Schwidefsky80217142010-10-25 16:10:11 +020059}
60
Martin Schwidefsky6252d702008-02-09 18:24:37 +010061#ifdef CONFIG_64BIT
62int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
63{
64 unsigned long *table, *pgd;
65 unsigned long entry;
66
67 BUG_ON(limit > (1UL << 53));
68repeat:
Martin Schwidefsky043d0702011-05-23 10:24:23 +020069 table = crst_table_alloc(mm);
Martin Schwidefsky6252d702008-02-09 18:24:37 +010070 if (!table)
71 return -ENOMEM;
Martin Schwidefsky80217142010-10-25 16:10:11 +020072 spin_lock_bh(&mm->page_table_lock);
Martin Schwidefsky6252d702008-02-09 18:24:37 +010073 if (mm->context.asce_limit < limit) {
74 pgd = (unsigned long *) mm->pgd;
75 if (mm->context.asce_limit <= (1UL << 31)) {
76 entry = _REGION3_ENTRY_EMPTY;
77 mm->context.asce_limit = 1UL << 42;
78 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
79 _ASCE_USER_BITS |
80 _ASCE_TYPE_REGION3;
81 } else {
82 entry = _REGION2_ENTRY_EMPTY;
83 mm->context.asce_limit = 1UL << 53;
84 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
85 _ASCE_USER_BITS |
86 _ASCE_TYPE_REGION2;
87 }
88 crst_table_init(table, entry);
89 pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
90 mm->pgd = (pgd_t *) table;
Martin Schwidefskyf481bfa2009-03-18 13:27:36 +010091 mm->task_size = mm->context.asce_limit;
Martin Schwidefsky6252d702008-02-09 18:24:37 +010092 table = NULL;
93 }
Martin Schwidefsky80217142010-10-25 16:10:11 +020094 spin_unlock_bh(&mm->page_table_lock);
Martin Schwidefsky6252d702008-02-09 18:24:37 +010095 if (table)
96 crst_table_free(mm, table);
97 if (mm->context.asce_limit < limit)
98 goto repeat;
99 update_mm(mm, current);
100 return 0;
101}
102
103void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
104{
105 pgd_t *pgd;
106
107 if (mm->context.asce_limit <= limit)
108 return;
109 __tlb_flush_mm(mm);
110 while (mm->context.asce_limit > limit) {
111 pgd = mm->pgd;
112 switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
113 case _REGION_ENTRY_TYPE_R2:
114 mm->context.asce_limit = 1UL << 42;
115 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
116 _ASCE_USER_BITS |
117 _ASCE_TYPE_REGION3;
118 break;
119 case _REGION_ENTRY_TYPE_R3:
120 mm->context.asce_limit = 1UL << 31;
121 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
122 _ASCE_USER_BITS |
123 _ASCE_TYPE_SEGMENT;
124 break;
125 default:
126 BUG();
127 }
128 mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
Martin Schwidefskyf481bfa2009-03-18 13:27:36 +0100129 mm->task_size = mm->context.asce_limit;
Martin Schwidefsky6252d702008-02-09 18:24:37 +0100130 crst_table_free(mm, (unsigned long *) pgd);
131 }
132 update_mm(mm, current);
133}
134#endif
135
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200136static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
137{
138 unsigned int old, new;
139
140 do {
141 old = atomic_read(v);
142 new = old ^ bits;
143 } while (atomic_cmpxchg(v, old, new) != old);
144 return new;
145}
146
/*
 * Page table allocation and free routines.
 */
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200150#ifdef CONFIG_PGSTE
151static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm)
152{
153 struct page *page;
154 unsigned long *table;
155
156 page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
157 if (!page)
158 return NULL;
159 pgtable_page_ctor(page);
160 atomic_set(&page->_mapcount, 3);
161 table = (unsigned long *) page_to_phys(page);
162 clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
163 clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
164 return table;
165}
166
167static inline void page_table_free_pgste(unsigned long *table)
168{
169 struct page *page;
170
171 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
172 pgtable_page_ctor(page);
173 atomic_set(&page->_mapcount, -1);
174 __free_page(page);
175}
176#endif
177
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100178unsigned long *page_table_alloc(struct mm_struct *mm)
Martin Schwidefsky3610cce2007-10-22 12:52:47 +0200179{
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100180 struct page *page;
Martin Schwidefsky3610cce2007-10-22 12:52:47 +0200181 unsigned long *table;
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200182 unsigned int mask, bit;
Martin Schwidefsky3610cce2007-10-22 12:52:47 +0200183
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200184#ifdef CONFIG_PGSTE
185 if (mm_has_pgste(mm))
186 return page_table_alloc_pgste(mm);
187#endif
188 /* Allocate fragments of a 4K page as 1K/2K page table */
Martin Schwidefsky80217142010-10-25 16:10:11 +0200189 spin_lock_bh(&mm->context.list_lock);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200190 mask = FRAG_MASK;
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100191 if (!list_empty(&mm->context.pgtable_list)) {
192 page = list_first_entry(&mm->context.pgtable_list,
193 struct page, lru);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200194 table = (unsigned long *) page_to_phys(page);
195 mask = atomic_read(&page->_mapcount);
196 mask = mask | (mask >> 4);
Martin Schwidefsky3610cce2007-10-22 12:52:47 +0200197 }
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200198 if ((mask & FRAG_MASK) == FRAG_MASK) {
Martin Schwidefsky80217142010-10-25 16:10:11 +0200199 spin_unlock_bh(&mm->context.list_lock);
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100200 page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
201 if (!page)
202 return NULL;
203 pgtable_page_ctor(page);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200204 atomic_set(&page->_mapcount, 1);
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100205 table = (unsigned long *) page_to_phys(page);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200206 clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);
Martin Schwidefsky80217142010-10-25 16:10:11 +0200207 spin_lock_bh(&mm->context.list_lock);
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100208 list_add(&page->lru, &mm->context.pgtable_list);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200209 } else {
210 for (bit = 1; mask & bit; bit <<= 1)
211 table += PTRS_PER_PTE;
212 mask = atomic_xor_bits(&page->_mapcount, bit);
213 if ((mask & FRAG_MASK) == FRAG_MASK)
214 list_del(&page->lru);
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100215 }
Martin Schwidefsky80217142010-10-25 16:10:11 +0200216 spin_unlock_bh(&mm->context.list_lock);
Martin Schwidefsky3610cce2007-10-22 12:52:47 +0200217 return table;
218}
219
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100220void page_table_free(struct mm_struct *mm, unsigned long *table)
Martin Schwidefsky3610cce2007-10-22 12:52:47 +0200221{
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100222 struct page *page;
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200223 unsigned int bit, mask;
Martin Schwidefsky3610cce2007-10-22 12:52:47 +0200224
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200225#ifdef CONFIG_PGSTE
226 if (mm_has_pgste(mm))
227 return page_table_free_pgste(table);
228#endif
229 /* Free 1K/2K page table fragment of a 4K page */
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100230 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200231 bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
Martin Schwidefsky80217142010-10-25 16:10:11 +0200232 spin_lock_bh(&mm->context.list_lock);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200233 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100234 list_del(&page->lru);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200235 mask = atomic_xor_bits(&page->_mapcount, bit);
236 if (mask & FRAG_MASK)
237 list_add(&page->lru, &mm->context.pgtable_list);
Martin Schwidefsky80217142010-10-25 16:10:11 +0200238 spin_unlock_bh(&mm->context.list_lock);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200239 if (mask == 0) {
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100240 pgtable_page_dtor(page);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200241 atomic_set(&page->_mapcount, -1);
Martin Schwidefsky146e4b32008-02-09 18:24:35 +0100242 __free_page(page);
243 }
244}
Martin Schwidefsky3610cce2007-10-22 12:52:47 +0200245
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200246#ifdef CONFIG_HAVE_RCU_TABLE_FREE
Martin Schwidefsky80217142010-10-25 16:10:11 +0200247
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200248static void __page_table_free_rcu(void *table, unsigned bit)
249{
250 struct page *page;
251
252#ifdef CONFIG_PGSTE
253 if (bit == FRAG_MASK)
254 return page_table_free_pgste(table);
255#endif
256 /* Free 1K/2K page table fragment of a 4K page */
257 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
258 if (atomic_xor_bits(&page->_mapcount, bit) == 0) {
259 pgtable_page_dtor(page);
260 atomic_set(&page->_mapcount, -1);
261 __free_page(page);
Martin Schwidefsky80217142010-10-25 16:10:11 +0200262 }
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200263}
264
265void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
266{
267 struct mm_struct *mm;
268 struct page *page;
269 unsigned int bit, mask;
270
271 mm = tlb->mm;
272#ifdef CONFIG_PGSTE
273 if (mm_has_pgste(mm)) {
274 table = (unsigned long *) (__pa(table) | FRAG_MASK);
275 tlb_remove_table(tlb, table);
276 return;
Martin Schwidefsky80217142010-10-25 16:10:11 +0200277 }
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200278#endif
279 bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
Martin Schwidefsky80217142010-10-25 16:10:11 +0200280 page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
281 spin_lock_bh(&mm->context.list_lock);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200282 if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
283 list_del(&page->lru);
284 mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4));
285 if (mask & FRAG_MASK)
286 list_add_tail(&page->lru, &mm->context.pgtable_list);
Martin Schwidefsky80217142010-10-25 16:10:11 +0200287 spin_unlock_bh(&mm->context.list_lock);
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200288 table = (unsigned long *) (__pa(table) | (bit << 4));
289 tlb_remove_table(tlb, table);
Martin Schwidefsky80217142010-10-25 16:10:11 +0200290}
291
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200292void __tlb_remove_table(void *_table)
293{
294 void *table = (void *)((unsigned long) _table & PAGE_MASK);
295 unsigned type = (unsigned long) _table & ~PAGE_MASK;
296
297 if (type)
298 __page_table_free_rcu(table, type);
299 else
300 free_pages((unsigned long) table, ALLOC_ORDER);
301}
302
303#endif
304
Carsten Otte402b0862008-03-25 18:47:10 +0100305/*
306 * switch on pgstes for its userspace process (for kvm)
307 */
308int s390_enable_sie(void)
309{
310 struct task_struct *tsk = current;
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200311 struct mm_struct *mm, *old_mm;
Carsten Otte402b0862008-03-25 18:47:10 +0100312
Carsten Otte702d9e52009-03-26 15:23:57 +0100313 /* Do we have switched amode? If no, we cannot do sie */
Martin Schwidefskyb11b5332009-12-07 12:51:43 +0100314 if (user_mode == HOME_SPACE_MODE)
Carsten Otte702d9e52009-03-26 15:23:57 +0100315 return -EINVAL;
316
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200317 /* Do we have pgstes? if yes, we are done */
Martin Schwidefsky36409f62011-06-06 14:14:41 +0200318 if (mm_has_pgste(tsk->mm))
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200319 return 0;
Carsten Otte402b0862008-03-25 18:47:10 +0100320
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200321 /* lets check if we are allowed to replace the mm */
322 task_lock(tsk);
Carsten Otte402b0862008-03-25 18:47:10 +0100323 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
Martin Schwidefsky52a21f22009-10-06 10:33:55 +0200324#ifdef CONFIG_AIO
325 !hlist_empty(&tsk->mm->ioctx_list) ||
326#endif
327 tsk->mm != tsk->active_mm) {
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200328 task_unlock(tsk);
329 return -EINVAL;
330 }
331 task_unlock(tsk);
Carsten Otte402b0862008-03-25 18:47:10 +0100332
Christian Borntraeger250cf772008-10-28 11:10:15 +0100333 /* we copy the mm and let dup_mm create the page tables with_pgstes */
334 tsk->mm->context.alloc_pgste = 1;
Carsten Otte402b0862008-03-25 18:47:10 +0100335 mm = dup_mm(tsk);
Christian Borntraeger250cf772008-10-28 11:10:15 +0100336 tsk->mm->context.alloc_pgste = 0;
Carsten Otte402b0862008-03-25 18:47:10 +0100337 if (!mm)
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200338 return -ENOMEM;
339
Christian Borntraeger250cf772008-10-28 11:10:15 +0100340 /* Now lets check again if something happened */
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200341 task_lock(tsk);
342 if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
Martin Schwidefsky52a21f22009-10-06 10:33:55 +0200343#ifdef CONFIG_AIO
344 !hlist_empty(&tsk->mm->ioctx_list) ||
345#endif
346 tsk->mm != tsk->active_mm) {
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200347 mmput(mm);
348 task_unlock(tsk);
349 return -EINVAL;
350 }
351
352 /* ok, we are alone. No ptrace, no threads, etc. */
353 old_mm = tsk->mm;
Carsten Otte402b0862008-03-25 18:47:10 +0100354 tsk->mm = tsk->active_mm = mm;
355 preempt_disable();
356 update_mm(mm, tsk);
Christian Borntraegere05ef9b2010-10-25 16:10:45 +0200357 atomic_inc(&mm->context.attach_count);
358 atomic_dec(&old_mm->context.attach_count);
Rusty Russell005f8ee2009-03-26 15:25:01 +0100359 cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
Carsten Otte402b0862008-03-25 18:47:10 +0100360 preempt_enable();
Carsten Otte402b0862008-03-25 18:47:10 +0100361 task_unlock(tsk);
Christian Borntraeger74b6b522008-05-21 13:37:29 +0200362 mmput(old_mm);
363 return 0;
Carsten Otte402b0862008-03-25 18:47:10 +0100364}
365EXPORT_SYMBOL_GPL(s390_enable_sie);
Hans-Joachim Picht7db11a32009-06-16 10:30:26 +0200366
Heiko Carstens87458ff2009-09-22 22:58:46 +0200367#if defined(CONFIG_DEBUG_PAGEALLOC) && defined(CONFIG_HIBERNATION)
Hans-Joachim Picht7db11a32009-06-16 10:30:26 +0200368bool kernel_page_present(struct page *page)
369{
370 unsigned long addr;
371 int cc;
372
373 addr = page_to_phys(page);
Heiko Carstens87458ff2009-09-22 22:58:46 +0200374 asm volatile(
375 " lra %1,0(%1)\n"
376 " ipm %0\n"
377 " srl %0,28"
378 : "=d" (cc), "+a" (addr) : : "cc");
Hans-Joachim Picht7db11a32009-06-16 10:30:26 +0200379 return cc == 0;
380}
Heiko Carstens87458ff2009-09-22 22:58:46 +0200381#endif /* CONFIG_HIBERNATION && CONFIG_DEBUG_PAGEALLOC */