/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/lmb.h>
#include <linux/of.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/system.h>
#include <asm/smp.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

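/* cpu-to-node and node-to-cpumask lookup tables, plus per-node pglist_data */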
int numa_cpu_lookup_table[NR_CPUS];
cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;

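/*
 * Carve out a new fake NUMA node whenever end_pfn crosses the next
 * boundary parsed from the "numa=fake=" command line string.
 * Returns 1 if a new fake node was created, 0 otherwise.
 */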
static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes.
	 * We want to continue from where we left off the last time.
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}

/*
 * get_active_region_work_fn - A helper function for get_node_active_region
 *	Returns datax set to the start_pfn and end_pfn if they contain
 *	the initial value of datax->start_pfn between them
 * @start_pfn: start page (inclusive) of region to check
 * @end_pfn: end page (exclusive) of region to check
 * @datax: comes in with ->start_pfn set to value to search for and
 *	goes out with active range if it contains it
 * Returns 1 if search value is in range else 0
 */
static int __init get_active_region_work_fn(unsigned long start_pfn,
					unsigned long end_pfn, void *datax)
{
	struct node_active_region *data;
	data = (struct node_active_region *)datax;

	if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
		data->start_pfn = start_pfn;
		data->end_pfn = end_pfn;
		return 1;
	}
	return 0;
}

/*
 * get_node_active_region - Return active region containing start_pfn
 * Active range returned is empty if none found.
 * @start_pfn: The page to return the region for.
 * @node_ar: Returned set to the active region containing start_pfn
 */
static void __init get_node_active_region(unsigned long start_pfn,
				struct node_active_region *node_ar)
{
	int nid = early_pfn_to_nid(start_pfn);

	node_ar->nid = nid;
	node_ar->start_pfn = start_pfn;
	node_ar->end_pfn = start_pfn;
	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
}

static void __cpuinit map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpu_isset(cpu, numa_cpumask_lookup_table[node])))
		cpu_set(cpu, numa_cpumask_lookup_table[node]);
}

#ifdef CONFIG_HOTPLUG_CPU
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
		cpu_clear(cpu, numa_cpumask_lookup_table[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU */

/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

/*
 * Returns the property linux,drconf-usable-memory if
 * it exists (the property exists only in kexec/kdump kernels,
 * added by kexec-tools)
 */
static const u32 *of_get_usable_memory(struct device_node *memory)
{
	const u32 *prop;
	u32 len;
	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;
	return prop;
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const unsigned int *tmp;

	if (min_common_depth == -1)
		goto out;

	tmp = of_get_associativity(device);
	if (!tmp)
		goto out;

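	/* tmp[0] is the number of entries in the associativity list */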
	if (tmp[0] >= min_common_depth)
		nid = tmp[min_common_depth];

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;
out:
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);

/*
 * In theory, the "ibm,associativity" property may contain multiple
 * associativity lists because a resource may be multiply connected
 * into the machine. This resource then has different associativity
 * characteristics relative to its multiple connections. We ignore
 * this for now. We also assume that all cpu and memory sets have
 * their distances represented at a common level. This won't be
 * true for hierarchical NUMA.
 *
 * In any case the ibm,associativity-reference-points should give
 * the correct depth for a normal NUMA system.
 *
 * - Dave Hansen <haveblue@us.ibm.com>
 */
static int __init find_min_common_depth(void)
{
	int depth;
	const unsigned int *ref_points;
	struct device_node *rtas_root;
	unsigned int len;

	rtas_root = of_find_node_by_path("/rtas");

	if (!rtas_root)
		return -1;

	/*
	 * this property is 2 32-bit integers, each representing a level of
	 * depth in the associativity nodes.  The first is for an SMP
	 * configuration (should be all 0's) and the second is for a normal
	 * NUMA configuration.
	 */
	ref_points = of_get_property(rtas_root,
			"ibm,associativity-reference-points", &len);

	if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
		depth = ref_points[1];
	} else {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		depth = -1;
	}
	of_node_put(rtas_root);

	return depth;
}

273
Mike Kravetz84c9fdd2005-11-30 13:47:23 -0800274static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275{
276 struct device_node *memory = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700277
278 memory = of_find_node_by_type(memory, "memory");
Paul Mackerras54c23312005-12-05 15:50:39 +1100279 if (!memory)
Mike Kravetz84c9fdd2005-11-30 13:47:23 -0800280 panic("numa.c: No memory nodes found!");
Paul Mackerras54c23312005-12-05 15:50:39 +1100281
Stephen Rothwella8bda5d2007-04-03 10:56:50 +1000282 *n_addr_cells = of_n_addr_cells(memory);
Stephen Rothwell9213fee2007-04-03 10:57:48 +1000283 *n_size_cells = of_n_size_cells(memory);
Mike Kravetz84c9fdd2005-11-30 13:47:23 -0800284 of_node_put(memory);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700285}
286
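/* Combine n 32-bit device-tree cells into one value, advancing *buf past them. */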
static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}

struct of_drconf_cell {
	u64	base_addr;
	u32	drc_index;
	u32	reserved;
	u32	aa_index;
	u32	flags;
};

#define DRCONF_MEM_ASSIGNED	0x00000008
#define DRCONF_MEM_AI_INVALID	0x00000040
#define DRCONF_MEM_RESERVED	0x00000080

/*
 * Read the next lmb list entry from the ibm,dynamic-memory property
 * and return the information in the provided of_drconf_cell structure.
 */
static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
{
	const u32 *cp;

	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);

	cp = *cellp;
	drmem->drc_index = cp[0];
	drmem->reserved = cp[1];
	drmem->aa_index = cp[2];
	drmem->flags = cp[3];

	*cellp = cp + 4;
}

/*
 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 *
 * The layout of the ibm,dynamic-memory property is a count N of lmb
 * list entries followed by N lmb list entries.  Each lmb list entry
 * contains information as laid out in the of_drconf_cell struct above.
 */
static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
{
	const u32 *prop;
	u32 len, entries;

	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	entries = *prop++;

	/* Now that we know the number of entries, revalidate the size
	 * of the property read in to ensure we have everything
	 */
	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
		return 0;

	*dm = prop;
	return entries;
}

/*
 * Retrieve and validate the ibm,lmb-size property for drconf memory
 * from the device tree.
 */
static u64 of_get_lmb_size(struct device_node *memory)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,lmb-size", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	return read_n_cells(n_mem_size_cells, &prop);
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const u32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct device_node *memory,
			       struct assoc_arrays *aa)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int))
		return -1;

	aa->n_arrays = *prop++;
	aa->array_sz = *prop++;

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
				   struct assoc_arrays *aa)
{
	int default_nid = 0;
	int nid = default_nid;
	int index;

	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
	    drmem->aa_index < aa->n_arrays) {
		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
		nid = aa->arrays[index];

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;
	}

	return nid;
}

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = any_online_node(NODE_MASK_ALL);
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}

static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						       unsigned long size)
{
	/*
	 * We use lmb_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.  Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= lmb_end_of_DRAM())
		return size;

	if (start >= lmb_end_of_DRAM())
		return 0;

	return lmb_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const u32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) pairs.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const u32 *dm, *usm;
	unsigned int n, rc, ranges, is_kexec_kdump = 0;
	unsigned long lmb_size, base, size, sz;
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return;

	/* check if this is a kexec/kdump kernel */
	usm = of_get_usable_memory(memory);
	if (usm != NULL)
		is_kexec_kdump = 1;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if the reserved bit is set in flags (0x80)
		   or if the block is not assigned to this partition (0x8) */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		base = drmem.base_addr;
		size = lmb_size;
		ranges = 1;

		if (is_kexec_kdump) {
			ranges = read_usm_ranges(&usm);
			if (!ranges) /* there are no (base, size) pairs */
				continue;
		}
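		/*
		 * In a kexec/kdump kernel each lmb may be split into
		 * several usable (base, size) ranges; otherwise the loop
		 * below runs once for the whole lmb.
		 */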
		do {
			if (is_kexec_kdump) {
				base = read_n_cells(n_mem_addr_cells, &usm);
				size = read_n_cells(n_mem_size_cells, &usm);
			}
			nid = of_drconf_to_nid_single(&drmem, &aa);
			fake_numa_create_new_node(
				((base + size) >> PAGE_SHIFT), &nid);
			node_set_online(nid);
			sz = numa_enforce_memory_limit(base, size);
			if (sz)
				add_active_range(nid, base >> PAGE_SHIFT,
						 (base >> PAGE_SHIFT)
						 + (sz >> PAGE_SHIFT));
		} while (--ranges);
	}
}
597
Linus Torvalds1da177e2005-04-16 15:20:36 -0700598static int __init parse_numa_properties(void)
599{
600 struct device_node *cpu = NULL;
601 struct device_node *memory = NULL;
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600602 int default_nid = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700603 unsigned long i;
604
605 if (numa_enabled == 0) {
606 printk(KERN_WARNING "NUMA disabled by user\n");
607 return -1;
608 }
609
Linus Torvalds1da177e2005-04-16 15:20:36 -0700610 min_common_depth = find_min_common_depth();
611
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612 if (min_common_depth < 0)
613 return min_common_depth;
614
Nathan Lynchbf4b85b2006-03-20 18:34:45 -0600615 dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
616
Linus Torvalds1da177e2005-04-16 15:20:36 -0700617 /*
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600618 * Even though we connect cpus to numa domains later in SMP
619 * init, we need to know the node ids now. This is because
620 * each node to be onlined must have NODE_DATA etc backing it.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700621 */
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600622 for_each_present_cpu(i) {
Nathan Lynchcf950b72006-03-20 18:35:45 -0600623 int nid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700624
Milton Miller8b16cd22009-01-08 02:19:45 +0000625 cpu = of_get_cpu_node(i, NULL);
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600626 BUG_ON(!cpu);
Jeremy Kerr953039c2006-05-01 12:16:12 -0700627 nid = of_node_to_nid_single(cpu);
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600628 of_node_put(cpu);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700629
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600630 /*
631 * Don't fall back to default_nid yet -- we will plug
632 * cpus into nodes once the memory scan has discovered
633 * the topology.
634 */
635 if (nid < 0)
636 continue;
637 node_set_online(nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700638 }
639
Mike Kravetz237a09892005-12-05 12:06:42 -0800640 get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700641 memory = NULL;
642 while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
643 unsigned long start;
644 unsigned long size;
Nathan Lynchcf950b72006-03-20 18:35:45 -0600645 int nid;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700646 int ranges;
Jeremy Kerra7f67bd2006-07-12 15:35:54 +1000647 const unsigned int *memcell_buf;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700648 unsigned int len;
649
Stephen Rothwelle2eb6392007-04-03 22:26:41 +1000650 memcell_buf = of_get_property(memory,
Michael Ellermanba759482005-12-04 18:39:55 +1100651 "linux,usable-memory", &len);
652 if (!memcell_buf || len <= 0)
Stephen Rothwelle2eb6392007-04-03 22:26:41 +1000653 memcell_buf = of_get_property(memory, "reg", &len);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700654 if (!memcell_buf || len <= 0)
655 continue;
656
Benjamin Herrenschmidtcc5d0182005-12-13 18:01:21 +1100657 /* ranges in cell */
658 ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659new_range:
660 /* these are order-sensitive, and modify the buffer pointer */
Mike Kravetz237a09892005-12-05 12:06:42 -0800661 start = read_n_cells(n_mem_addr_cells, &memcell_buf);
662 size = read_n_cells(n_mem_size_cells, &memcell_buf);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700663
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600664 /*
665 * Assumption: either all memory nodes or none will
666 * have associativity properties. If none, then
667 * everything goes to default_nid.
668 */
Jeremy Kerr953039c2006-05-01 12:16:12 -0700669 nid = of_node_to_nid_single(memory);
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600670 if (nid < 0)
671 nid = default_nid;
Balbir Singh1daa6d02008-02-01 15:57:31 +1100672
673 fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
Nathan Lynch482ec7c2006-03-20 18:36:45 -0600674 node_set_online(nid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700675
Anton Blanchard45fb6ce2005-11-11 14:22:35 +1100676 if (!(size = numa_enforce_memory_limit(start, size))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700677 if (--ranges)
678 goto new_range;
679 else
680 continue;
681 }
682
Mel Gormanc67c3cb2006-09-27 01:49:49 -0700683 add_active_range(nid, start >> PAGE_SHIFT,
684 (start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700685
686 if (--ranges)
687 goto new_range;
688 }
689
Paul Mackerras02045682006-11-29 22:27:42 +1100690 /*
691 * Now do the same thing for each LMB listed in the ibm,dynamic-memory
692 * property in the ibm,dynamic-reconfiguration-memory node.
693 */
694 memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
695 if (memory)
696 parse_drconf_memory(memory);
697
Linus Torvalds1da177e2005-04-16 15:20:36 -0700698 return 0;
699}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = lmb_end_of_DRAM();
	unsigned long total_ram = lmb_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int i, nid = 0;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for (i = 0; i < lmb.memory.cnt; ++i) {
		start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
		end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);

		fake_numa_create_new_node(end_pfn, &nid);
		add_active_range(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			if (cpu_isset(cpu, numa_cpumask_lookup_table[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", NR_CPUS - 1);
		printk("\n");
	}
}

static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

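		/* walk memory in sparsemem section-sized steps */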
		for (i = 0; i < lmb_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

/*
 * Allocate some memory, satisfying the lmb or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the virtual address of the memory.
 */
static void __init *careful_zallocation(int nid, unsigned long size,
				       unsigned long align,
				       unsigned long end_pfn)
{
	void *ret;
	int new_nid;
	unsigned long ret_paddr;

	ret_paddr = __lmb_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret_paddr)
		ret_paddr = __lmb_alloc_base(size, align, lmb_end_of_DRAM());

	if (!ret_paddr)
		panic("numa.c: cannot allocate %lu bytes for node %d",
		      size, nid);

	ret = __va(ret_paddr);

	/*
	 * We initialize the nodes in numeric order: 0, 1, 2...
	 * and hand over control from the LMB allocator to the
	 * bootmem allocator.  If this function is called for
	 * node 5, then we know that all nodes <5 are using the
	 * bootmem allocator instead of the LMB allocator.
	 *
	 * So, check the nid from which this allocation came
	 * and double check to see if we need to use bootmem
	 * instead of the LMB.  We don't free the LMB memory
	 * since it would be useless.
	 */
	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = __alloc_bootmem_node(NODE_DATA(new_nid),
				size, align, 0);

		dbg("alloc_bootmem %p %lx\n", ret, size);
	}

	memset(ret, 0, size);
	return ret;
}

static struct notifier_block __cpuinitdata ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};

static void mark_reserved_regions_for_nid(int nid)
{
	struct pglist_data *node = NODE_DATA(nid);
	int i;

	for (i = 0; i < lmb.reserved.cnt; i++) {
		unsigned long physbase = lmb.reserved.region[i].base;
		unsigned long size = lmb.reserved.region[i].size;
		unsigned long start_pfn = physbase >> PAGE_SHIFT;
		unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
		struct node_active_region node_ar;
		unsigned long node_end_pfn = node->node_start_pfn +
					     node->node_spanned_pages;

		/*
		 * Check to make sure that this lmb.reserved area is
		 * within the bounds of the node that we care about.
		 * Checking the nid of the start and end points is not
		 * sufficient because the reserved area could span the
		 * entire node.
		 */
		if (end_pfn <= node->node_start_pfn ||
		    start_pfn >= node_end_pfn)
			continue;

		get_node_active_region(start_pfn, &node_ar);
		while (start_pfn < end_pfn &&
			node_ar.start_pfn < node_ar.end_pfn) {
			unsigned long reserve_size = size;
			/*
			 * if reserved region extends past active region
			 * then trim size to active region
			 */
			if (end_pfn > node_ar.end_pfn)
				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
					- (start_pfn << PAGE_SHIFT);
			/*
			 * Only worry about *this* node, others may not
			 * yet have valid NODE_DATA().
			 */
			if (node_ar.nid == nid) {
				dbg("reserve_bootmem %lx %lx nid=%d\n",
					physbase, reserve_size, node_ar.nid);
				reserve_bootmem_node(NODE_DATA(node_ar.nid),
						physbase, reserve_size,
						BOOTMEM_DEFAULT);
			}
			/*
			 * if reserved region is contained in the active region
			 * then done.
			 */
			if (end_pfn <= node_ar.end_pfn)
				break;

			/*
			 * reserved region extends past the active region
			 *   get next active region that contains this
			 *   reserved region
			 */
			start_pfn = node_ar.end_pfn;
			physbase = start_pfn << PAGE_SHIFT;
			size = size - reserve_size;
			get_node_active_region(start_pfn, &node_ar);
		}
	}
}

void __init do_init_bootmem(void)
{
	int nid;

	min_low_pfn = 0;
	max_low_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

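	/* register the notifier for later cpus and set up the boot cpu now */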
	register_cpu_notifier(&ppc64_numa_nb);
	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
			  (void *)(unsigned long)boot_cpuid);

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		void *bootmem_vaddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/*
		 * Allocate the node structure node local if possible
		 *
		 * Be careful moving this around, as it relies on all
		 * previous nodes' bootmem to be initialized and have
		 * all reserved areas marked.
		 */
		NODE_DATA(nid) = careful_zallocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_vaddr = careful_zallocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);

		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);

		init_bootmem_node(NODE_DATA(nid),
				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);
		/*
		 * Be very careful about moving this around.  Future
		 * calls to careful_zallocation() depend on this getting
		 * done correctly.
		 */
		mark_reserved_regions_for_nid(nid);
		sparse_memory_present_with_active_regions(nid);
	}
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
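	/* on ppc64 all of memory falls within ZONE_DMA */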
	max_zone_pfns[ZONE_DMA] = lmb_end_of_DRAM() >> PAGE_SHIFT;
	free_area_init_nodes(max_zone_pfns);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Validate the node associated with the memory section we are
 * trying to add.
 */
int valid_hot_add_scn(int *nid, unsigned long start, u32 lmb_size,
		      unsigned long scn_addr)
{
	nodemask_t nodes;

	if (*nid < 0 || !node_online(*nid))
		*nid = any_online_node(NODE_MASK_ALL);

	if ((scn_addr >= start) && (scn_addr < (start + lmb_size))) {
		nodes_setall(nodes);
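		/* fall back to an online node that actually spans memory */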
		while (NODE_DATA(*nid)->node_spanned_pages == 0) {
			node_clear(*nid, nodes);
			*nid = any_online_node(nodes);
		}

		return 1;
	}

	return 0;
}

/*
 * Find the node associated with a hot added memory section represented
 * by the ibm,dynamic-reconfiguration-memory node.
 */
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const u32 *dm;
	unsigned int n, rc;
	unsigned long lmb_size;
	int default_nid = any_online_node(NODE_MASK_ALL);
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return default_nid;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return default_nid;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return default_nid;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);

		if (valid_hot_add_scn(&nid, drmem.base_addr, lmb_size,
				      scn_addr))
			return nid;
	}

	BUG();	/* section address should be found above */
	return 0;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not an LMB.  It is assumed that
 * sections are fully contained within a single LMB.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid;

	if (!numa_enabled || (min_common_depth < 0))
		return any_online_node(NODE_MASK_ALL);

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
		return nid;
	}

	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start, size;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
ha_new_range:
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);
		nid = of_node_to_nid_single(memory);

		if (valid_hot_add_scn(&nid, start, size, scn_addr)) {
			of_node_put(memory);
			return nid;
		}

		if (--ranges)	/* process all ranges in cell */
			goto ha_new_range;
	}
	BUG();	/* section address should be found above */
	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG */