blob: c512cc911e2234a715a1ab384f8c572f1b0ea3a5 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Simple NUMA memory policy for the Linux kernel.
3 *
4 * Copyright 2003,2004 Andi Kleen, SuSE Labs.
5 * Subject to the GNU Public License, version 2.
6 *
7 * NUMA policy allows the user to give hints in which node(s) memory should
8 * be allocated.
9 *
10 * Support four policies per VMA and per process:
11 *
12 * The VMA policy has priority over the process policy for a page fault.
13 *
14 * interleave Allocate memory interleaved over a set of nodes,
15 * with normal fallback if it fails.
16 * For VMA based allocations this interleaves based on the
17 * offset into the backing object or offset into the mapping
18 * for anonymous memory. For process policy an process counter
19 * is used.
20 * bind Only allocate memory on a specific set of nodes,
21 * no fallback.
22 * preferred Try a specific node first before normal fallback.
23 * As a special case node -1 here means do the allocation
24 * on the local CPU. This is normally identical to default,
25 * but useful to set in a VMA when you have a non default
26 * process policy.
27 * default Allocate on the local node first, or when on a VMA
28 * use the process policy. This is what Linux always did
29 * in a NUMA aware kernel and still does by, ahem, default.
30 *
31 * The process policy is applied for most non interrupt memory allocations
32 * in that process' context. Interrupts ignore the policies and always
33 * try to allocate on the local CPU. The VMA policy is only applied for memory
34 * allocations for a VMA in the VM.
35 *
36 * Currently there are a few corner cases in swapping where the policy
37 * is not applied, but the majority should be handled. When process policy
38 * is used it is not remembered over swap outs/swap ins.
39 *
40 * Only the highest zone in the zone hierarchy gets policied. Allocations
41 * requesting a lower zone just use default policy. This implies that
42 * on systems with highmem kernel lowmem allocation don't get policied.
43 * Same with GFP_DMA allocations.
44 *
45 * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
46 * all users and remembered even when nobody has memory mapped.
47 */
48
49/* Notebook:
50 fix mmap readahead to honour policy and enable policy for any page cache
51 object
52 statistics for bigpages
53 global policy for page cache? currently it uses process policy. Requires
54 first item above.
55 handle mremap for shared memory (currently ignored for the policy)
56 grows down?
57 make bind policy root only? It can trigger oom much faster and the
58 kernel is not always grateful with that.
59 could replace all the switch()es with a mempolicy_ops structure.
60*/
61
62#include <linux/mempolicy.h>
63#include <linux/mm.h>
64#include <linux/highmem.h>
65#include <linux/hugetlb.h>
66#include <linux/kernel.h>
67#include <linux/sched.h>
68#include <linux/mm.h>
69#include <linux/nodemask.h>
70#include <linux/cpuset.h>
71#include <linux/gfp.h>
72#include <linux/slab.h>
73#include <linux/string.h>
74#include <linux/module.h>
75#include <linux/interrupt.h>
76#include <linux/init.h>
77#include <linux/compat.h>
78#include <linux/mempolicy.h>
79#include <asm/tlbflush.h>
80#include <asm/uaccess.h>
81
82static kmem_cache_t *policy_cache;
83static kmem_cache_t *sn_cache;
84
85#define PDprintk(fmt...)
86
87/* Highest zone. An specific allocation for a zone below that is not
88 policied. */
89static int policy_zone;
90
91static struct mempolicy default_policy = {
92 .refcnt = ATOMIC_INIT(1), /* never free it */
93 .policy = MPOL_DEFAULT,
94};
95
96/* Check if all specified nodes are online */
97static int nodes_online(unsigned long *nodes)
98{
99 DECLARE_BITMAP(online2, MAX_NUMNODES);
100
101 bitmap_copy(online2, nodes_addr(node_online_map), MAX_NUMNODES);
102 if (bitmap_empty(online2, MAX_NUMNODES))
103 set_bit(0, online2);
104 if (!bitmap_subset(nodes, online2, MAX_NUMNODES))
105 return -EINVAL;
106 return 0;
107}
108
109/* Do sanity checking on a policy */
110static int mpol_check_policy(int mode, unsigned long *nodes)
111{
112 int empty = bitmap_empty(nodes, MAX_NUMNODES);
113
114 switch (mode) {
115 case MPOL_DEFAULT:
116 if (!empty)
117 return -EINVAL;
118 break;
119 case MPOL_BIND:
120 case MPOL_INTERLEAVE:
121 /* Preferred will only use the first bit, but allow
122 more for now. */
123 if (empty)
124 return -EINVAL;
125 break;
126 }
127 return nodes_online(nodes);
128}
129
130/* Copy a node mask from user space. */
131static int get_nodes(unsigned long *nodes, unsigned long __user *nmask,
132 unsigned long maxnode, int mode)
133{
134 unsigned long k;
135 unsigned long nlongs;
136 unsigned long endmask;
137
138 --maxnode;
139 bitmap_zero(nodes, MAX_NUMNODES);
140 if (maxnode == 0 || !nmask)
141 return 0;
142
143 nlongs = BITS_TO_LONGS(maxnode);
144 if ((maxnode % BITS_PER_LONG) == 0)
145 endmask = ~0UL;
146 else
147 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
148
149 /* When the user specified more nodes than supported just check
150 if the non supported part is all zero. */
151 if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
152 if (nlongs > PAGE_SIZE/sizeof(long))
153 return -EINVAL;
154 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
155 unsigned long t;
156 if (get_user(t, nmask + k))
157 return -EFAULT;
158 if (k == nlongs - 1) {
159 if (t & endmask)
160 return -EINVAL;
161 } else if (t)
162 return -EINVAL;
163 }
164 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
165 endmask = ~0UL;
166 }
167
168 if (copy_from_user(nodes, nmask, nlongs*sizeof(unsigned long)))
169 return -EFAULT;
170 nodes[nlongs-1] &= endmask;
171 /* Update current mems_allowed */
172 cpuset_update_current_mems_allowed();
173 /* Ignore nodes not set in current->mems_allowed */
174 cpuset_restrict_to_mems_allowed(nodes);
175 return mpol_check_policy(mode, nodes);
176}
177
178/* Generate a custom zonelist for the BIND policy. */
179static struct zonelist *bind_zonelist(unsigned long *nodes)
180{
181 struct zonelist *zl;
182 int num, max, nd;
183
184 max = 1 + MAX_NR_ZONES * bitmap_weight(nodes, MAX_NUMNODES);
185 zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
186 if (!zl)
187 return NULL;
188 num = 0;
189 for (nd = find_first_bit(nodes, MAX_NUMNODES);
190 nd < MAX_NUMNODES;
191 nd = find_next_bit(nodes, MAX_NUMNODES, 1+nd)) {
192 int k;
193 for (k = MAX_NR_ZONES-1; k >= 0; k--) {
194 struct zone *z = &NODE_DATA(nd)->node_zones[k];
195 if (!z->present_pages)
196 continue;
197 zl->zones[num++] = z;
198 if (k > policy_zone)
199 policy_zone = k;
200 }
201 }
202 BUG_ON(num >= max);
203 zl->zones[num] = NULL;
204 return zl;
205}
206
207/* Create a new policy */
208static struct mempolicy *mpol_new(int mode, unsigned long *nodes)
209{
210 struct mempolicy *policy;
211
212 PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes[0]);
213 if (mode == MPOL_DEFAULT)
214 return NULL;
215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
216 if (!policy)
217 return ERR_PTR(-ENOMEM);
218 atomic_set(&policy->refcnt, 1);
219 switch (mode) {
220 case MPOL_INTERLEAVE:
221 bitmap_copy(policy->v.nodes, nodes, MAX_NUMNODES);
222 break;
223 case MPOL_PREFERRED:
224 policy->v.preferred_node = find_first_bit(nodes, MAX_NUMNODES);
225 if (policy->v.preferred_node >= MAX_NUMNODES)
226 policy->v.preferred_node = -1;
227 break;
228 case MPOL_BIND:
229 policy->v.zonelist = bind_zonelist(nodes);
230 if (policy->v.zonelist == NULL) {
231 kmem_cache_free(policy_cache, policy);
232 return ERR_PTR(-ENOMEM);
233 }
234 break;
235 }
236 policy->policy = mode;
237 return policy;
238}
239
240/* Ensure all existing pages follow the policy. */
241static int
242verify_pages(struct mm_struct *mm,
243 unsigned long addr, unsigned long end, unsigned long *nodes)
244{
Hugh Dickins941150a2005-06-21 17:15:06 -0700245 int err = 0;
246
247 spin_lock(&mm->page_table_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700248 while (addr < end) {
249 struct page *p;
250 pte_t *pte;
251 pmd_t *pmd;
252 pud_t *pud;
253 pgd_t *pgd;
254 pgd = pgd_offset(mm, addr);
255 if (pgd_none(*pgd)) {
256 unsigned long next = (addr + PGDIR_SIZE) & PGDIR_MASK;
257 if (next > addr)
258 break;
259 addr = next;
260 continue;
261 }
262 pud = pud_offset(pgd, addr);
263 if (pud_none(*pud)) {
264 addr = (addr + PUD_SIZE) & PUD_MASK;
265 continue;
266 }
267 pmd = pmd_offset(pud, addr);
268 if (pmd_none(*pmd)) {
269 addr = (addr + PMD_SIZE) & PMD_MASK;
270 continue;
271 }
272 p = NULL;
273 pte = pte_offset_map(pmd, addr);
Hugh Dickins941150a2005-06-21 17:15:06 -0700274 if (pte_present(*pte)) {
275 unsigned long pfn = pte_pfn(*pte);
276 if (pfn_valid(pfn))
277 p = pfn_to_page(pfn);
278 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700279 pte_unmap(pte);
280 if (p) {
281 unsigned nid = page_to_nid(p);
Hugh Dickins941150a2005-06-21 17:15:06 -0700282 if (!test_bit(nid, nodes)) {
283 err = -EIO;
284 break;
285 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700286 }
287 addr += PAGE_SIZE;
288 }
Hugh Dickins941150a2005-06-21 17:15:06 -0700289 spin_unlock(&mm->page_table_lock);
290 return err;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700291}
292
293/* Step 1: check the range */
294static struct vm_area_struct *
295check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
296 unsigned long *nodes, unsigned long flags)
297{
298 int err;
299 struct vm_area_struct *first, *vma, *prev;
300
301 first = find_vma(mm, start);
302 if (!first)
303 return ERR_PTR(-EFAULT);
304 prev = NULL;
305 for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
306 if (!vma->vm_next && vma->vm_end < end)
307 return ERR_PTR(-EFAULT);
308 if (prev && prev->vm_end < vma->vm_start)
309 return ERR_PTR(-EFAULT);
310 if ((flags & MPOL_MF_STRICT) && !is_vm_hugetlb_page(vma)) {
311 err = verify_pages(vma->vm_mm,
312 vma->vm_start, vma->vm_end, nodes);
313 if (err) {
314 first = ERR_PTR(err);
315 break;
316 }
317 }
318 prev = vma;
319 }
320 return first;
321}
322
323/* Apply policy to a single VMA */
324static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
325{
326 int err = 0;
327 struct mempolicy *old = vma->vm_policy;
328
329 PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
330 vma->vm_start, vma->vm_end, vma->vm_pgoff,
331 vma->vm_ops, vma->vm_file,
332 vma->vm_ops ? vma->vm_ops->set_policy : NULL);
333
334 if (vma->vm_ops && vma->vm_ops->set_policy)
335 err = vma->vm_ops->set_policy(vma, new);
336 if (!err) {
337 mpol_get(new);
338 vma->vm_policy = new;
339 mpol_free(old);
340 }
341 return err;
342}
343
344/* Step 2: apply policy to a range and do splits. */
345static int mbind_range(struct vm_area_struct *vma, unsigned long start,
346 unsigned long end, struct mempolicy *new)
347{
348 struct vm_area_struct *next;
349 int err;
350
351 err = 0;
352 for (; vma && vma->vm_start < end; vma = next) {
353 next = vma->vm_next;
354 if (vma->vm_start < start)
355 err = split_vma(vma->vm_mm, vma, start, 1);
356 if (!err && vma->vm_end > end)
357 err = split_vma(vma->vm_mm, vma, end, 0);
358 if (!err)
359 err = policy_vma(vma, new);
360 if (err)
361 break;
362 }
363 return err;
364}
365
366/* Change policy for a memory range */
367asmlinkage long sys_mbind(unsigned long start, unsigned long len,
368 unsigned long mode,
369 unsigned long __user *nmask, unsigned long maxnode,
370 unsigned flags)
371{
372 struct vm_area_struct *vma;
373 struct mm_struct *mm = current->mm;
374 struct mempolicy *new;
375 unsigned long end;
376 DECLARE_BITMAP(nodes, MAX_NUMNODES);
377 int err;
378
379 if ((flags & ~(unsigned long)(MPOL_MF_STRICT)) || mode > MPOL_MAX)
380 return -EINVAL;
381 if (start & ~PAGE_MASK)
382 return -EINVAL;
383 if (mode == MPOL_DEFAULT)
384 flags &= ~MPOL_MF_STRICT;
385 len = (len + PAGE_SIZE - 1) & PAGE_MASK;
386 end = start + len;
387 if (end < start)
388 return -EINVAL;
389 if (end == start)
390 return 0;
391
392 err = get_nodes(nodes, nmask, maxnode, mode);
393 if (err)
394 return err;
395
396 new = mpol_new(mode, nodes);
397 if (IS_ERR(new))
398 return PTR_ERR(new);
399
400 PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
401 mode,nodes[0]);
402
403 down_write(&mm->mmap_sem);
404 vma = check_range(mm, start, end, nodes, flags);
405 err = PTR_ERR(vma);
406 if (!IS_ERR(vma))
407 err = mbind_range(vma, start, end, new);
408 up_write(&mm->mmap_sem);
409 mpol_free(new);
410 return err;
411}
412
413/* Set the process memory policy */
414asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
415 unsigned long maxnode)
416{
417 int err;
418 struct mempolicy *new;
419 DECLARE_BITMAP(nodes, MAX_NUMNODES);
420
421 if (mode > MPOL_MAX)
422 return -EINVAL;
423 err = get_nodes(nodes, nmask, maxnode, mode);
424 if (err)
425 return err;
426 new = mpol_new(mode, nodes);
427 if (IS_ERR(new))
428 return PTR_ERR(new);
429 mpol_free(current->mempolicy);
430 current->mempolicy = new;
431 if (new && new->policy == MPOL_INTERLEAVE)
432 current->il_next = find_first_bit(new->v.nodes, MAX_NUMNODES);
433 return 0;
434}
435
436/* Fill a zone bitmap for a policy */
437static void get_zonemask(struct mempolicy *p, unsigned long *nodes)
438{
439 int i;
440
441 bitmap_zero(nodes, MAX_NUMNODES);
442 switch (p->policy) {
443 case MPOL_BIND:
444 for (i = 0; p->v.zonelist->zones[i]; i++)
445 __set_bit(p->v.zonelist->zones[i]->zone_pgdat->node_id, nodes);
446 break;
447 case MPOL_DEFAULT:
448 break;
449 case MPOL_INTERLEAVE:
450 bitmap_copy(nodes, p->v.nodes, MAX_NUMNODES);
451 break;
452 case MPOL_PREFERRED:
453 /* or use current node instead of online map? */
454 if (p->v.preferred_node < 0)
455 bitmap_copy(nodes, nodes_addr(node_online_map), MAX_NUMNODES);
456 else
457 __set_bit(p->v.preferred_node, nodes);
458 break;
459 default:
460 BUG();
461 }
462}
463
464static int lookup_node(struct mm_struct *mm, unsigned long addr)
465{
466 struct page *p;
467 int err;
468
469 err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
470 if (err >= 0) {
471 err = page_to_nid(p);
472 put_page(p);
473 }
474 return err;
475}
476
477/* Copy a kernel node mask to user space */
478static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
479 void *nodes, unsigned nbytes)
480{
481 unsigned long copy = ALIGN(maxnode-1, 64) / 8;
482
483 if (copy > nbytes) {
484 if (copy > PAGE_SIZE)
485 return -EINVAL;
486 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
487 return -EFAULT;
488 copy = nbytes;
489 }
490 return copy_to_user(mask, nodes, copy) ? -EFAULT : 0;
491}
492
493/* Retrieve NUMA policy */
494asmlinkage long sys_get_mempolicy(int __user *policy,
495 unsigned long __user *nmask,
496 unsigned long maxnode,
497 unsigned long addr, unsigned long flags)
498{
499 int err, pval;
500 struct mm_struct *mm = current->mm;
501 struct vm_area_struct *vma = NULL;
502 struct mempolicy *pol = current->mempolicy;
503
504 if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
505 return -EINVAL;
506 if (nmask != NULL && maxnode < MAX_NUMNODES)
507 return -EINVAL;
508 if (flags & MPOL_F_ADDR) {
509 down_read(&mm->mmap_sem);
510 vma = find_vma_intersection(mm, addr, addr+1);
511 if (!vma) {
512 up_read(&mm->mmap_sem);
513 return -EFAULT;
514 }
515 if (vma->vm_ops && vma->vm_ops->get_policy)
516 pol = vma->vm_ops->get_policy(vma, addr);
517 else
518 pol = vma->vm_policy;
519 } else if (addr)
520 return -EINVAL;
521
522 if (!pol)
523 pol = &default_policy;
524
525 if (flags & MPOL_F_NODE) {
526 if (flags & MPOL_F_ADDR) {
527 err = lookup_node(mm, addr);
528 if (err < 0)
529 goto out;
530 pval = err;
531 } else if (pol == current->mempolicy &&
532 pol->policy == MPOL_INTERLEAVE) {
533 pval = current->il_next;
534 } else {
535 err = -EINVAL;
536 goto out;
537 }
538 } else
539 pval = pol->policy;
540
541 if (vma) {
542 up_read(&current->mm->mmap_sem);
543 vma = NULL;
544 }
545
546 if (policy && put_user(pval, policy))
547 return -EFAULT;
548
549 err = 0;
550 if (nmask) {
551 DECLARE_BITMAP(nodes, MAX_NUMNODES);
552 get_zonemask(pol, nodes);
553 err = copy_nodes_to_user(nmask, maxnode, nodes, sizeof(nodes));
554 }
555
556 out:
557 if (vma)
558 up_read(&current->mm->mmap_sem);
559 return err;
560}
561
562#ifdef CONFIG_COMPAT
563
564asmlinkage long compat_sys_get_mempolicy(int __user *policy,
565 compat_ulong_t __user *nmask,
566 compat_ulong_t maxnode,
567 compat_ulong_t addr, compat_ulong_t flags)
568{
569 long err;
570 unsigned long __user *nm = NULL;
571 unsigned long nr_bits, alloc_size;
572 DECLARE_BITMAP(bm, MAX_NUMNODES);
573
574 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
575 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
576
577 if (nmask)
578 nm = compat_alloc_user_space(alloc_size);
579
580 err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
581
582 if (!err && nmask) {
583 err = copy_from_user(bm, nm, alloc_size);
584 /* ensure entire bitmap is zeroed */
585 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
586 err |= compat_put_bitmap(nmask, bm, nr_bits);
587 }
588
589 return err;
590}
591
592asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
593 compat_ulong_t maxnode)
594{
595 long err = 0;
596 unsigned long __user *nm = NULL;
597 unsigned long nr_bits, alloc_size;
598 DECLARE_BITMAP(bm, MAX_NUMNODES);
599
600 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
601 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
602
603 if (nmask) {
604 err = compat_get_bitmap(bm, nmask, nr_bits);
605 nm = compat_alloc_user_space(alloc_size);
606 err |= copy_to_user(nm, bm, alloc_size);
607 }
608
609 if (err)
610 return -EFAULT;
611
612 return sys_set_mempolicy(mode, nm, nr_bits+1);
613}
614
615asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
616 compat_ulong_t mode, compat_ulong_t __user *nmask,
617 compat_ulong_t maxnode, compat_ulong_t flags)
618{
619 long err = 0;
620 unsigned long __user *nm = NULL;
621 unsigned long nr_bits, alloc_size;
622 DECLARE_BITMAP(bm, MAX_NUMNODES);
623
624 nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
625 alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
626
627 if (nmask) {
628 err = compat_get_bitmap(bm, nmask, nr_bits);
629 nm = compat_alloc_user_space(alloc_size);
630 err |= copy_to_user(nm, bm, alloc_size);
631 }
632
633 if (err)
634 return -EFAULT;
635
636 return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
637}
638
639#endif
640
641/* Return effective policy for a VMA */
642static struct mempolicy *
643get_vma_policy(struct vm_area_struct *vma, unsigned long addr)
644{
645 struct mempolicy *pol = current->mempolicy;
646
647 if (vma) {
648 if (vma->vm_ops && vma->vm_ops->get_policy)
649 pol = vma->vm_ops->get_policy(vma, addr);
650 else if (vma->vm_policy &&
651 vma->vm_policy->policy != MPOL_DEFAULT)
652 pol = vma->vm_policy;
653 }
654 if (!pol)
655 pol = &default_policy;
656 return pol;
657}
658
659/* Return a zonelist representing a mempolicy */
660static struct zonelist *zonelist_policy(unsigned int __nocast gfp, struct mempolicy *policy)
661{
662 int nd;
663
664 switch (policy->policy) {
665 case MPOL_PREFERRED:
666 nd = policy->v.preferred_node;
667 if (nd < 0)
668 nd = numa_node_id();
669 break;
670 case MPOL_BIND:
671 /* Lower zones don't get a policy applied */
672 /* Careful: current->mems_allowed might have moved */
Al Viro01424962005-04-24 12:28:34 -0700673 if ((gfp & GFP_ZONEMASK) >= policy_zone)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700674 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
675 return policy->v.zonelist;
676 /*FALL THROUGH*/
677 case MPOL_INTERLEAVE: /* should not happen */
678 case MPOL_DEFAULT:
679 nd = numa_node_id();
680 break;
681 default:
682 nd = 0;
683 BUG();
684 }
685 return NODE_DATA(nd)->node_zonelists + (gfp & GFP_ZONEMASK);
686}
687
688/* Do dynamic interleaving for a process */
689static unsigned interleave_nodes(struct mempolicy *policy)
690{
691 unsigned nid, next;
692 struct task_struct *me = current;
693
694 nid = me->il_next;
695 BUG_ON(nid >= MAX_NUMNODES);
696 next = find_next_bit(policy->v.nodes, MAX_NUMNODES, 1+nid);
697 if (next >= MAX_NUMNODES)
698 next = find_first_bit(policy->v.nodes, MAX_NUMNODES);
699 me->il_next = next;
700 return nid;
701}
702
703/* Do static interleaving for a VMA with known offset. */
704static unsigned offset_il_node(struct mempolicy *pol,
705 struct vm_area_struct *vma, unsigned long off)
706{
707 unsigned nnodes = bitmap_weight(pol->v.nodes, MAX_NUMNODES);
708 unsigned target = (unsigned)off % nnodes;
709 int c;
710 int nid = -1;
711
712 c = 0;
713 do {
714 nid = find_next_bit(pol->v.nodes, MAX_NUMNODES, nid+1);
715 c++;
716 } while (c <= target);
717 BUG_ON(nid >= MAX_NUMNODES);
718 BUG_ON(!test_bit(nid, pol->v.nodes));
719 return nid;
720}
721
722/* Allocate a page in interleaved policy.
723 Own path because it needs to do special accounting. */
724static struct page *alloc_page_interleave(unsigned int __nocast gfp, unsigned order, unsigned nid)
725{
726 struct zonelist *zl;
727 struct page *page;
728
729 BUG_ON(!node_online(nid));
730 zl = NODE_DATA(nid)->node_zonelists + (gfp & GFP_ZONEMASK);
731 page = __alloc_pages(gfp, order, zl);
732 if (page && page_zone(page) == zl->zones[0]) {
Christoph Lametere7c8d5c2005-06-21 17:14:47 -0700733 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700734 put_cpu();
735 }
736 return page;
737}
738
739/**
740 * alloc_page_vma - Allocate a page for a VMA.
741 *
742 * @gfp:
743 * %GFP_USER user allocation.
744 * %GFP_KERNEL kernel allocations,
745 * %GFP_HIGHMEM highmem/user allocations,
746 * %GFP_FS allocation should not call back into a file system.
747 * %GFP_ATOMIC don't sleep.
748 *
749 * @vma: Pointer to VMA or NULL if not available.
750 * @addr: Virtual Address of the allocation. Must be inside the VMA.
751 *
752 * This function allocates a page from the kernel page pool and applies
753 * a NUMA policy associated with the VMA or the current process.
754 * When VMA is not NULL caller must hold down_read on the mmap_sem of the
755 * mm_struct of the VMA to prevent it from going away. Should be used for
756 * all allocations for pages that will be mapped into
757 * user space. Returns NULL when no page can be allocated.
758 *
759 * Should be called with the mm_sem of the vma hold.
760 */
761struct page *
762alloc_page_vma(unsigned int __nocast gfp, struct vm_area_struct *vma, unsigned long addr)
763{
764 struct mempolicy *pol = get_vma_policy(vma, addr);
765
766 cpuset_update_current_mems_allowed();
767
768 if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
769 unsigned nid;
770 if (vma) {
771 unsigned long off;
772 BUG_ON(addr >= vma->vm_end);
773 BUG_ON(addr < vma->vm_start);
774 off = vma->vm_pgoff;
775 off += (addr - vma->vm_start) >> PAGE_SHIFT;
776 nid = offset_il_node(pol, vma, off);
777 } else {
778 /* fall back to process interleaving */
779 nid = interleave_nodes(pol);
780 }
781 return alloc_page_interleave(gfp, 0, nid);
782 }
783 return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
784}
785
786/**
787 * alloc_pages_current - Allocate pages.
788 *
789 * @gfp:
790 * %GFP_USER user allocation,
791 * %GFP_KERNEL kernel allocation,
792 * %GFP_HIGHMEM highmem allocation,
793 * %GFP_FS don't call back into a file system.
794 * %GFP_ATOMIC don't sleep.
795 * @order: Power of two of allocation size in pages. 0 is a single page.
796 *
797 * Allocate a page from the kernel page pool. When not in
798 * interrupt context and apply the current process NUMA policy.
799 * Returns NULL when no page can be allocated.
800 *
801 * Don't call cpuset_update_current_mems_allowed() unless
802 * 1) it's ok to take cpuset_sem (can WAIT), and
803 * 2) allocating for current task (not interrupt).
804 */
805struct page *alloc_pages_current(unsigned int __nocast gfp, unsigned order)
806{
807 struct mempolicy *pol = current->mempolicy;
808
809 if ((gfp & __GFP_WAIT) && !in_interrupt())
810 cpuset_update_current_mems_allowed();
811 if (!pol || in_interrupt())
812 pol = &default_policy;
813 if (pol->policy == MPOL_INTERLEAVE)
814 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
815 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
816}
817EXPORT_SYMBOL(alloc_pages_current);
818
819/* Slow path of a mempolicy copy */
820struct mempolicy *__mpol_copy(struct mempolicy *old)
821{
822 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
823
824 if (!new)
825 return ERR_PTR(-ENOMEM);
826 *new = *old;
827 atomic_set(&new->refcnt, 1);
828 if (new->policy == MPOL_BIND) {
829 int sz = ksize(old->v.zonelist);
830 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
831 if (!new->v.zonelist) {
832 kmem_cache_free(policy_cache, new);
833 return ERR_PTR(-ENOMEM);
834 }
835 memcpy(new->v.zonelist, old->v.zonelist, sz);
836 }
837 return new;
838}
839
840/* Slow path of a mempolicy comparison */
841int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
842{
843 if (!a || !b)
844 return 0;
845 if (a->policy != b->policy)
846 return 0;
847 switch (a->policy) {
848 case MPOL_DEFAULT:
849 return 1;
850 case MPOL_INTERLEAVE:
851 return bitmap_equal(a->v.nodes, b->v.nodes, MAX_NUMNODES);
852 case MPOL_PREFERRED:
853 return a->v.preferred_node == b->v.preferred_node;
854 case MPOL_BIND: {
855 int i;
856 for (i = 0; a->v.zonelist->zones[i]; i++)
857 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
858 return 0;
859 return b->v.zonelist->zones[i] == NULL;
860 }
861 default:
862 BUG();
863 return 0;
864 }
865}
866
867/* Slow path of a mpol destructor. */
868void __mpol_free(struct mempolicy *p)
869{
870 if (!atomic_dec_and_test(&p->refcnt))
871 return;
872 if (p->policy == MPOL_BIND)
873 kfree(p->v.zonelist);
874 p->policy = MPOL_DEFAULT;
875 kmem_cache_free(policy_cache, p);
876}
877
878/*
879 * Hugetlb policy. Same as above, just works with node numbers instead of
880 * zonelists.
881 */
882
883/* Find first node suitable for an allocation */
884int mpol_first_node(struct vm_area_struct *vma, unsigned long addr)
885{
886 struct mempolicy *pol = get_vma_policy(vma, addr);
887
888 switch (pol->policy) {
889 case MPOL_DEFAULT:
890 return numa_node_id();
891 case MPOL_BIND:
892 return pol->v.zonelist->zones[0]->zone_pgdat->node_id;
893 case MPOL_INTERLEAVE:
894 return interleave_nodes(pol);
895 case MPOL_PREFERRED:
896 return pol->v.preferred_node >= 0 ?
897 pol->v.preferred_node : numa_node_id();
898 }
899 BUG();
900 return 0;
901}
902
903/* Find secondary valid nodes for an allocation */
904int mpol_node_valid(int nid, struct vm_area_struct *vma, unsigned long addr)
905{
906 struct mempolicy *pol = get_vma_policy(vma, addr);
907
908 switch (pol->policy) {
909 case MPOL_PREFERRED:
910 case MPOL_DEFAULT:
911 case MPOL_INTERLEAVE:
912 return 1;
913 case MPOL_BIND: {
914 struct zone **z;
915 for (z = pol->v.zonelist->zones; *z; z++)
916 if ((*z)->zone_pgdat->node_id == nid)
917 return 1;
918 return 0;
919 }
920 default:
921 BUG();
922 return 0;
923 }
924}
925
926/*
927 * Shared memory backing store policy support.
928 *
929 * Remember policies even when nobody has shared memory mapped.
930 * The policies are kept in Red-Black tree linked from the inode.
931 * They are protected by the sp->lock spinlock, which should be held
932 * for any accesses to the tree.
933 */
934
935/* lookup first element intersecting start-end */
936/* Caller holds sp->lock */
937static struct sp_node *
938sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
939{
940 struct rb_node *n = sp->root.rb_node;
941
942 while (n) {
943 struct sp_node *p = rb_entry(n, struct sp_node, nd);
944
945 if (start >= p->end)
946 n = n->rb_right;
947 else if (end <= p->start)
948 n = n->rb_left;
949 else
950 break;
951 }
952 if (!n)
953 return NULL;
954 for (;;) {
955 struct sp_node *w = NULL;
956 struct rb_node *prev = rb_prev(n);
957 if (!prev)
958 break;
959 w = rb_entry(prev, struct sp_node, nd);
960 if (w->end <= start)
961 break;
962 n = prev;
963 }
964 return rb_entry(n, struct sp_node, nd);
965}
966
967/* Insert a new shared policy into the list. */
968/* Caller holds sp->lock */
969static void sp_insert(struct shared_policy *sp, struct sp_node *new)
970{
971 struct rb_node **p = &sp->root.rb_node;
972 struct rb_node *parent = NULL;
973 struct sp_node *nd;
974
975 while (*p) {
976 parent = *p;
977 nd = rb_entry(parent, struct sp_node, nd);
978 if (new->start < nd->start)
979 p = &(*p)->rb_left;
980 else if (new->end > nd->end)
981 p = &(*p)->rb_right;
982 else
983 BUG();
984 }
985 rb_link_node(&new->nd, parent, p);
986 rb_insert_color(&new->nd, &sp->root);
987 PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
988 new->policy ? new->policy->policy : 0);
989}
990
991/* Find shared policy intersecting idx */
992struct mempolicy *
993mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
994{
995 struct mempolicy *pol = NULL;
996 struct sp_node *sn;
997
998 if (!sp->root.rb_node)
999 return NULL;
1000 spin_lock(&sp->lock);
1001 sn = sp_lookup(sp, idx, idx+1);
1002 if (sn) {
1003 mpol_get(sn->policy);
1004 pol = sn->policy;
1005 }
1006 spin_unlock(&sp->lock);
1007 return pol;
1008}
1009
1010static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1011{
1012 PDprintk("deleting %lx-l%x\n", n->start, n->end);
1013 rb_erase(&n->nd, &sp->root);
1014 mpol_free(n->policy);
1015 kmem_cache_free(sn_cache, n);
1016}
1017
1018struct sp_node *
1019sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1020{
1021 struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1022
1023 if (!n)
1024 return NULL;
1025 n->start = start;
1026 n->end = end;
1027 mpol_get(pol);
1028 n->policy = pol;
1029 return n;
1030}
1031
1032/* Replace a policy range. */
1033static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1034 unsigned long end, struct sp_node *new)
1035{
1036 struct sp_node *n, *new2 = NULL;
1037
1038restart:
1039 spin_lock(&sp->lock);
1040 n = sp_lookup(sp, start, end);
1041 /* Take care of old policies in the same range. */
1042 while (n && n->start < end) {
1043 struct rb_node *next = rb_next(&n->nd);
1044 if (n->start >= start) {
1045 if (n->end <= end)
1046 sp_delete(sp, n);
1047 else
1048 n->start = end;
1049 } else {
1050 /* Old policy spanning whole new range. */
1051 if (n->end > end) {
1052 if (!new2) {
1053 spin_unlock(&sp->lock);
1054 new2 = sp_alloc(end, n->end, n->policy);
1055 if (!new2)
1056 return -ENOMEM;
1057 goto restart;
1058 }
1059 n->end = start;
1060 sp_insert(sp, new2);
1061 new2 = NULL;
1062 break;
1063 } else
1064 n->end = start;
1065 }
1066 if (!next)
1067 break;
1068 n = rb_entry(next, struct sp_node, nd);
1069 }
1070 if (new)
1071 sp_insert(sp, new);
1072 spin_unlock(&sp->lock);
1073 if (new2) {
1074 mpol_free(new2->policy);
1075 kmem_cache_free(sn_cache, new2);
1076 }
1077 return 0;
1078}
1079
1080int mpol_set_shared_policy(struct shared_policy *info,
1081 struct vm_area_struct *vma, struct mempolicy *npol)
1082{
1083 int err;
1084 struct sp_node *new = NULL;
1085 unsigned long sz = vma_pages(vma);
1086
1087 PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1088 vma->vm_pgoff,
1089 sz, npol? npol->policy : -1,
1090 npol ? npol->v.nodes[0] : -1);
1091
1092 if (npol) {
1093 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1094 if (!new)
1095 return -ENOMEM;
1096 }
1097 err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1098 if (err && new)
1099 kmem_cache_free(sn_cache, new);
1100 return err;
1101}
1102
1103/* Free a backing policy store on inode delete. */
1104void mpol_free_shared_policy(struct shared_policy *p)
1105{
1106 struct sp_node *n;
1107 struct rb_node *next;
1108
1109 if (!p->root.rb_node)
1110 return;
1111 spin_lock(&p->lock);
1112 next = rb_first(&p->root);
1113 while (next) {
1114 n = rb_entry(next, struct sp_node, nd);
1115 next = rb_next(&n->nd);
1116 mpol_free(n->policy);
1117 kmem_cache_free(sn_cache, n);
1118 }
1119 spin_unlock(&p->lock);
1120 p->root = RB_ROOT;
1121}
1122
1123/* assumes fs == KERNEL_DS */
1124void __init numa_policy_init(void)
1125{
1126 policy_cache = kmem_cache_create("numa_policy",
1127 sizeof(struct mempolicy),
1128 0, SLAB_PANIC, NULL, NULL);
1129
1130 sn_cache = kmem_cache_create("shared_policy_node",
1131 sizeof(struct sp_node),
1132 0, SLAB_PANIC, NULL, NULL);
1133
1134 /* Set interleaving policy for system init. This way not all
1135 the data structures allocated at system boot end up in node zero. */
1136
1137 if (sys_set_mempolicy(MPOL_INTERLEAVE, nodes_addr(node_online_map),
1138 MAX_NUMNODES) < 0)
1139 printk("numa_policy_init: interleaving failed\n");
1140}
1141
1142/* Reset policy of current process to default.
1143 * Assumes fs == KERNEL_DS */
1144void numa_default_policy(void)
1145{
1146 sys_set_mempolicy(MPOL_DEFAULT, NULL, 0);
1147}