blob: 2d8fdc05f415428c9ec8b0a1681a470f624371ae [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
Thomas Gleixnerc9ff0342008-01-30 13:30:16 +010020#include <asm/k8.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070021
22#ifndef Dprintk
23#define Dprintk(x...)
24#endif
25
Ravikiran G Thirumalai6c231b72005-09-06 15:17:45 -070026struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070027bootmem_data_t plat_node_bdata[MAX_NUMNODES];
28
Eric Dumazetdcf36bf2006-03-25 16:31:46 +010029struct memnode memnode;
Linus Torvalds1da177e2005-04-16 15:20:36 -070030
Andi Kleen3f098c22005-09-12 18:49:24 +020031unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
32 [0 ... NR_CPUS-1] = NUMA_NO_NODE
Andi Kleen0b07e982005-09-12 18:49:24 +020033};
Andi Kleen3f098c22005-09-12 18:49:24 +020034unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
35 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
36};
37cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
Linus Torvalds1da177e2005-04-16 15:20:36 -070038
39int numa_off __initdata;
Amul Shah076422d2007-02-13 13:26:19 +010040unsigned long __initdata nodemap_addr;
41unsigned long __initdata nodemap_size;
Linus Torvalds1da177e2005-04-16 15:20:36 -070042
Eric Dumazet529a3402005-11-05 17:25:54 +010043
44/*
45 * Given a shift value, try to populate memnodemap[]
46 * Returns :
47 * 1 if OK
48 * 0 if memnodmap[] too small (of shift too small)
49 * -1 if node overlap or lost ram (shift too big)
50 */
Andi Kleend18ff472006-01-11 22:44:30 +010051static int __init
Andi Kleenabe059e2006-03-25 16:29:12 +010052populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
Linus Torvalds1da177e2005-04-16 15:20:36 -070053{
54 int i;
Eric Dumazet529a3402005-11-05 17:25:54 +010055 int res = -1;
56 unsigned long addr, end;
Keith Manntheyb6846642005-07-28 21:15:38 -070057
Amul Shah076422d2007-02-13 13:26:19 +010058 memset(memnodemap, 0xff, memnodemapsize);
Eric Dumazet529a3402005-11-05 17:25:54 +010059 for (i = 0; i < numnodes; i++) {
60 addr = nodes[i].start;
61 end = nodes[i].end;
62 if (addr >= end)
63 continue;
Amul Shah076422d2007-02-13 13:26:19 +010064 if ((end >> shift) >= memnodemapsize)
Eric Dumazet529a3402005-11-05 17:25:54 +010065 return 0;
66 do {
67 if (memnodemap[addr >> shift] != 0xff)
68 return -1;
69 memnodemap[addr >> shift] = i;
Amul Shah076422d2007-02-13 13:26:19 +010070 addr += (1UL << shift);
Eric Dumazet529a3402005-11-05 17:25:54 +010071 } while (addr < end);
72 res = 1;
73 }
74 return res;
75}
76
Amul Shah076422d2007-02-13 13:26:19 +010077static int __init allocate_cachealigned_memnodemap(void)
78{
79 unsigned long pad, pad_addr;
80
81 memnodemap = memnode.embedded_map;
Amul Shah54413922007-02-13 13:26:20 +010082 if (memnodemapsize <= 48)
Amul Shah076422d2007-02-13 13:26:19 +010083 return 0;
Amul Shah076422d2007-02-13 13:26:19 +010084
85 pad = L1_CACHE_BYTES - 1;
86 pad_addr = 0x8000;
87 nodemap_size = pad + memnodemapsize;
88 nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
89 nodemap_size);
90 if (nodemap_addr == -1UL) {
91 printk(KERN_ERR
92 "NUMA: Unable to allocate Memory to Node hash map\n");
93 nodemap_addr = nodemap_size = 0;
94 return -1;
95 }
96 pad_addr = (nodemap_addr + pad) & ~pad;
97 memnodemap = phys_to_virt(pad_addr);
98
99 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
100 nodemap_addr, nodemap_addr + nodemap_size);
101 return 0;
102}
103
104/*
105 * The LSB of all start and end addresses in the node map is the value of the
106 * maximum possible shift.
107 */
108static int __init
109extract_lsb_from_nodes (const struct bootnode *nodes, int numnodes)
110{
Amul Shah54413922007-02-13 13:26:20 +0100111 int i, nodes_used = 0;
Amul Shah076422d2007-02-13 13:26:19 +0100112 unsigned long start, end;
113 unsigned long bitfield = 0, memtop = 0;
114
115 for (i = 0; i < numnodes; i++) {
116 start = nodes[i].start;
117 end = nodes[i].end;
118 if (start >= end)
119 continue;
Amul Shah54413922007-02-13 13:26:20 +0100120 bitfield |= start;
121 nodes_used++;
Amul Shah076422d2007-02-13 13:26:19 +0100122 if (end > memtop)
123 memtop = end;
124 }
Amul Shah54413922007-02-13 13:26:20 +0100125 if (nodes_used <= 1)
126 i = 63;
127 else
128 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
Amul Shah076422d2007-02-13 13:26:19 +0100129 memnodemapsize = (memtop >> i)+1;
130 return i;
131}
132
Andi Kleenabe059e2006-03-25 16:29:12 +0100133int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
Eric Dumazet529a3402005-11-05 17:25:54 +0100134{
Amul Shah076422d2007-02-13 13:26:19 +0100135 int shift;
Eric Dumazet529a3402005-11-05 17:25:54 +0100136
Amul Shah076422d2007-02-13 13:26:19 +0100137 shift = extract_lsb_from_nodes(nodes, numnodes);
138 if (allocate_cachealigned_memnodemap())
139 return -1;
Andi Kleen6b050f82006-01-11 22:44:33 +0100140 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
Eric Dumazet529a3402005-11-05 17:25:54 +0100141 shift);
142
143 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
144 printk(KERN_INFO
Keith Manntheyb6846642005-07-28 21:15:38 -0700145 "Your memory is not aligned you need to rebuild your kernel "
Eric Dumazet529a3402005-11-05 17:25:54 +0100146 "with a bigger NODEMAPSIZE shift=%d\n",
147 shift);
148 return -1;
149 }
Keith Manntheyb6846642005-07-28 21:15:38 -0700150 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700151}
152
#ifdef CONFIG_SPARSEMEM
/* Return what phys_to_nid() reports for the start address of page frame pfn. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
159
Andi Kleena8062232006-04-07 19:49:21 +0200160static void * __init
161early_node_mem(int nodeid, unsigned long start, unsigned long end,
162 unsigned long size)
163{
164 unsigned long mem = find_e820_area(start, end, size);
165 void *ptr;
166 if (mem != -1L)
167 return __va(mem);
168 ptr = __alloc_bootmem_nopanic(size,
169 SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
Yoann Padioleau83e83d52007-10-17 18:04:35 +0200170 if (ptr == NULL) {
Andi Kleena8062232006-04-07 19:49:21 +0200171 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
172 size, nodeid);
173 return NULL;
174 }
175 return ptr;
176}
177
Linus Torvalds1da177e2005-04-16 15:20:36 -0700178/* Initialize bootmem allocator for a node */
179void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
180{
181 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
182 unsigned long nodedata_phys;
Andi Kleena8062232006-04-07 19:49:21 +0200183 void *bootmap;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700184 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
185
186 start = round_up(start, ZONE_ALIGN);
187
Andi Kleen6b050f82006-01-11 22:44:33 +0100188 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700189
190 start_pfn = start >> PAGE_SHIFT;
191 end_pfn = end >> PAGE_SHIFT;
192
Andi Kleena8062232006-04-07 19:49:21 +0200193 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
194 if (node_data[nodeid] == NULL)
195 return;
196 nodedata_phys = __pa(node_data[nodeid]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700197
Linus Torvalds1da177e2005-04-16 15:20:36 -0700198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
199 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
201 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
202
203 /* Find a place for the bootmem map */
204 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
205 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
Andi Kleena8062232006-04-07 19:49:21 +0200206 bootmap = early_node_mem(nodeid, bootmap_start, end,
207 bootmap_pages<<PAGE_SHIFT);
208 if (bootmap == NULL) {
209 if (nodedata_phys < start || nodedata_phys >= end)
210 free_bootmem((unsigned long)node_data[nodeid],pgdat_size);
211 node_data[nodeid] = NULL;
212 return;
213 }
214 bootmap_start = __pa(bootmap);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700215 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
216
217 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
218 bootmap_start >> PAGE_SHIFT,
219 start_pfn, end_pfn);
220
Mel Gorman5cb248a2006-09-27 01:49:52 -0700221 free_bootmem_with_active_regions(nodeid, end);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700222
223 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
224 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
Andi Kleen68a3a7f2006-04-07 19:49:18 +0200225#ifdef CONFIG_ACPI_NUMA
226 srat_reserve_add_area(nodeid);
227#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700228 node_set_online(nodeid);
229}
230
231/* Initialize final allocator for a zone */
232void __init setup_node_zones(int nodeid)
233{
Andi Kleen267b4802006-03-25 16:31:10 +0100234 unsigned long start_pfn, end_pfn, memmapsize, limit;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700235
Andi Kleena2f1b422005-11-05 17:25:53 +0100236 start_pfn = node_start_pfn(nodeid);
237 end_pfn = node_end_pfn(nodeid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700238
Mel Gorman5cb248a2006-09-27 01:49:52 -0700239 Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
Andi Kleena2f1b422005-11-05 17:25:53 +0100240 nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700241
Andi Kleen267b4802006-03-25 16:31:10 +0100242 /* Try to allocate mem_map at end to not fill up precious <4GB
243 memory. */
244 memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
245 limit = end_pfn << PAGE_SHIFT;
Andy Whitcroft3b5fd592006-04-22 02:35:41 -0700246#ifdef CONFIG_FLAT_NODE_MEM_MAP
Andi Kleen267b4802006-03-25 16:31:10 +0100247 NODE_DATA(nodeid)->node_mem_map =
248 __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
249 memmapsize, SMP_CACHE_BYTES,
250 round_down(limit - memmapsize, PAGE_SIZE),
251 limit);
Andy Whitcroft3b5fd592006-04-22 02:35:41 -0700252#endif
Linus Torvalds1da177e2005-04-16 15:20:36 -0700253}
254
255void __init numa_init_array(void)
256{
257 int rr, i;
258 /* There are unfortunately some poorly designed mainboards around
259 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
260 mapping. To avoid this fill in the mapping for all possible
261 CPUs, as the number of CPUs is not known yet.
262 We round robin the existing nodes. */
Ravikiran G Thirumalai85cc5132005-09-30 11:59:22 -0700263 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700264 for (i = 0; i < NR_CPUS; i++) {
Mike Travis98c9e272007-10-17 18:04:39 +0200265 if (cpu_to_node(i) != NUMA_NO_NODE)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700266 continue;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100267 numa_set_node(i, rr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268 rr = next_node(rr, node_online_map);
269 if (rr == MAX_NUMNODES)
270 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271 }
272
Linus Torvalds1da177e2005-04-16 15:20:36 -0700273}
274
275#ifdef CONFIG_NUMA_EMU
Rohit Seth53fee042007-02-13 13:26:22 +0100276/* Numa emulation */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200277char *cmdline __initdata;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278
Rohit Seth53fee042007-02-13 13:26:22 +0100279/*
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200280 * Setups up nid to range from addr to addr + size. If the end boundary is
281 * greater than max_addr, then max_addr is used instead. The return value is 0
282 * if there is additional memory left for allocation past addr and -1 otherwise.
283 * addr is adjusted to be at the end of the node.
Rohit Seth53fee042007-02-13 13:26:22 +0100284 */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200285static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
286 u64 size, u64 max_addr)
Rohit Seth53fee042007-02-13 13:26:22 +0100287{
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200288 int ret = 0;
289 nodes[nid].start = *addr;
290 *addr += size;
291 if (*addr >= max_addr) {
292 *addr = max_addr;
293 ret = -1;
294 }
295 nodes[nid].end = *addr;
Suresh Siddhae3f1cae2007-05-02 19:27:20 +0200296 node_set(nid, node_possible_map);
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200297 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
298 nodes[nid].start, nodes[nid].end,
299 (nodes[nid].end - nodes[nid].start) >> 20);
300 return ret;
Rohit Seth53fee042007-02-13 13:26:22 +0100301}
302
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200303/*
304 * Splits num_nodes nodes up equally starting at node_start. The return value
305 * is the number of nodes split up and addr is adjusted to be at the end of the
306 * last node allocated.
307 */
308static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr,
309 u64 max_addr, int node_start,
310 int num_nodes)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700311{
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200312 unsigned int big;
313 u64 size;
314 int i;
Rohit Seth53fee042007-02-13 13:26:22 +0100315
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200316 if (num_nodes <= 0)
317 return -1;
318 if (num_nodes > MAX_NUMNODES)
319 num_nodes = MAX_NUMNODES;
David Rientjesa7e96622007-07-21 17:11:29 +0200320 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) /
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200321 num_nodes;
Rohit Seth53fee042007-02-13 13:26:22 +0100322 /*
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200323 * Calculate the number of big nodes that can be allocated as a result
324 * of consolidating the leftovers.
Rohit Seth53fee042007-02-13 13:26:22 +0100325 */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200326 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) /
327 FAKE_NODE_MIN_SIZE;
Rohit Seth53fee042007-02-13 13:26:22 +0100328
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200329 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
330 size &= FAKE_NODE_MIN_HASH_MASK;
331 if (!size) {
332 printk(KERN_ERR "Not enough memory for each node. "
333 "NUMA emulation disabled.\n");
334 return -1;
Rohit Seth53fee042007-02-13 13:26:22 +0100335 }
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200336
337 for (i = node_start; i < num_nodes + node_start; i++) {
338 u64 end = *addr + size;
Rohit Seth53fee042007-02-13 13:26:22 +0100339 if (i < big)
340 end += FAKE_NODE_MIN_SIZE;
341 /*
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200342 * The final node can have the remaining system RAM. Other
343 * nodes receive roughly the same amount of available pages.
Rohit Seth53fee042007-02-13 13:26:22 +0100344 */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200345 if (i == num_nodes + node_start - 1)
Rohit Seth53fee042007-02-13 13:26:22 +0100346 end = max_addr;
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200347 else
David Rientjesa7e96622007-07-21 17:11:29 +0200348 while (end - *addr - e820_hole_size(*addr, end) <
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200349 size) {
350 end += FAKE_NODE_MIN_SIZE;
351 if (end > max_addr) {
352 end = max_addr;
353 break;
354 }
355 }
356 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0)
357 break;
358 }
359 return i - node_start + 1;
360}
361
362/*
David Rientjes382591d2007-05-02 19:27:09 +0200363 * Splits the remaining system RAM into chunks of size. The remaining memory is
364 * always assigned to a final node and can be asymmetric. Returns the number of
365 * nodes split.
366 */
367static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr,
368 u64 max_addr, int node_start, u64 size)
369{
370 int i = node_start;
371 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK;
372 while (!setup_node_range(i++, nodes, addr, size, max_addr))
373 ;
374 return i - node_start;
375}
376
377/*
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200378 * Sets up the system RAM area from start_pfn to end_pfn according to the
379 * numa=fake command-line option.
380 */
381static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
382{
383 struct bootnode nodes[MAX_NUMNODES];
384 u64 addr = start_pfn << PAGE_SHIFT;
385 u64 max_addr = end_pfn << PAGE_SHIFT;
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200386 int num_nodes = 0;
David Rientjes382591d2007-05-02 19:27:09 +0200387 int coeff_flag;
388 int coeff = -1;
389 int num = 0;
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200390 u64 size;
391 int i;
392
393 memset(&nodes, 0, sizeof(nodes));
394 /*
395 * If the numa=fake command-line is just a single number N, split the
396 * system RAM into N fake nodes.
397 */
398 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) {
399 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0,
400 simple_strtol(cmdline, NULL, 0));
401 if (num_nodes < 0)
402 return num_nodes;
403 goto out;
404 }
405
406 /* Parse the command line. */
David Rientjes382591d2007-05-02 19:27:09 +0200407 for (coeff_flag = 0; ; cmdline++) {
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200408 if (*cmdline && isdigit(*cmdline)) {
409 num = num * 10 + *cmdline - '0';
410 continue;
411 }
David Rientjes382591d2007-05-02 19:27:09 +0200412 if (*cmdline == '*') {
413 if (num > 0)
414 coeff = num;
415 coeff_flag = 1;
416 }
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200417 if (!*cmdline || *cmdline == ',') {
David Rientjes382591d2007-05-02 19:27:09 +0200418 if (!coeff_flag)
419 coeff = 1;
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200420 /*
421 * Round down to the nearest FAKE_NODE_MIN_SIZE.
422 * Command-line coefficients are in megabytes.
423 */
424 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK;
David Rientjes382591d2007-05-02 19:27:09 +0200425 if (size)
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200426 for (i = 0; i < coeff; i++, num_nodes++)
427 if (setup_node_range(num_nodes, nodes,
428 &addr, size, max_addr) < 0)
429 goto done;
David Rientjes382591d2007-05-02 19:27:09 +0200430 if (!*cmdline)
431 break;
432 coeff_flag = 0;
433 coeff = -1;
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200434 }
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200435 num = 0;
436 }
437done:
438 if (!num_nodes)
439 return -1;
David Rientjes14694d72007-05-02 19:27:09 +0200440 /* Fill remainder of system RAM, if appropriate. */
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200441 if (addr < max_addr) {
David Rientjes382591d2007-05-02 19:27:09 +0200442 if (coeff_flag && coeff < 0) {
443 /* Split remaining nodes into num-sized chunks */
444 num_nodes += split_nodes_by_size(nodes, &addr, max_addr,
445 num_nodes, num);
446 goto out;
447 }
David Rientjes14694d72007-05-02 19:27:09 +0200448 switch (*(cmdline - 1)) {
449 case '*':
450 /* Split remaining nodes into coeff chunks */
451 if (coeff <= 0)
452 break;
453 num_nodes += split_nodes_equally(nodes, &addr, max_addr,
454 num_nodes, coeff);
455 break;
456 case ',':
457 /* Do not allocate remaining system RAM */
458 break;
459 default:
460 /* Give one final node */
461 setup_node_range(num_nodes, nodes, &addr,
462 max_addr - addr, max_addr);
463 num_nodes++;
464 }
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200465 }
466out:
467 memnode_shift = compute_hash_shift(nodes, num_nodes);
468 if (memnode_shift < 0) {
469 memnode_shift = 0;
470 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
471 "disabled.\n");
472 return -1;
473 }
474
475 /*
476 * We need to vacate all active ranges that may have been registered by
David Rientjes1c05f092007-07-21 17:11:30 +0200477 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns
478 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200479 */
480 remove_all_active_ranges();
David Rientjes1c05f092007-07-21 17:11:30 +0200481#ifdef CONFIG_ACPI_NUMA
482 acpi_numa = -1;
483#endif
Suresh Siddhae3f1cae2007-05-02 19:27:20 +0200484 for_each_node_mask(i, node_possible_map) {
Mel Gorman5cb248a2006-09-27 01:49:52 -0700485 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
486 nodes[i].end >> PAGE_SHIFT);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700487 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
Mel Gorman5cb248a2006-09-27 01:49:52 -0700488 }
David Rientjes3484d792007-07-21 17:10:32 +0200489 acpi_fake_nodes(nodes, num_nodes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700490 numa_init_array();
491 return 0;
492}
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200493#endif /* CONFIG_NUMA_EMU */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700494
495void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
496{
497 int i;
498
Suresh Siddhae3f1cae2007-05-02 19:27:20 +0200499 nodes_clear(node_possible_map);
500
Linus Torvalds1da177e2005-04-16 15:20:36 -0700501#ifdef CONFIG_NUMA_EMU
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200502 if (cmdline && !numa_emulation(start_pfn, end_pfn))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700503 return;
Suresh Siddhae3f1cae2007-05-02 19:27:20 +0200504 nodes_clear(node_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700505#endif
506
507#ifdef CONFIG_ACPI_NUMA
508 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
509 end_pfn << PAGE_SHIFT))
510 return;
Suresh Siddhae3f1cae2007-05-02 19:27:20 +0200511 nodes_clear(node_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700512#endif
513
514#ifdef CONFIG_K8_NUMA
515 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
516 return;
Suresh Siddhae3f1cae2007-05-02 19:27:20 +0200517 nodes_clear(node_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700518#endif
519 printk(KERN_INFO "%s\n",
520 numa_off ? "NUMA turned off" : "No NUMA configuration found");
521
522 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
523 start_pfn << PAGE_SHIFT,
524 end_pfn << PAGE_SHIFT);
525 /* setup dummy node covering all memory */
526 memnode_shift = 63;
Amul Shah076422d2007-02-13 13:26:19 +0100527 memnodemap = memnode.embedded_map;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700528 memnodemap[0] = 0;
529 nodes_clear(node_online_map);
530 node_set_online(0);
Suresh Siddhae3f1cae2007-05-02 19:27:20 +0200531 node_set(0, node_possible_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700532 for (i = 0; i < NR_CPUS; i++)
Andi Kleen69d81fc2005-11-05 17:25:53 +0100533 numa_set_node(i, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700534 node_to_cpumask[0] = cpumask_of_cpu(0);
Mel Gorman5cb248a2006-09-27 01:49:52 -0700535 e820_register_active_regions(0, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700536 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
537}
538
Ashok Raje6982c62005-06-25 14:54:58 -0700539__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700540{
Ravikiran G Thirumalaie6a045a2005-09-30 11:59:21 -0700541 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700542}
543
Andi Kleen69d81fc2005-11-05 17:25:53 +0100544void __cpuinit numa_set_node(int cpu, int node)
545{
Ravikiran G Thirumalaidf79efd2006-01-11 22:45:39 +0100546 cpu_pda(cpu)->nodenumber = node;
Mike Travis98c9e272007-10-17 18:04:39 +0200547 cpu_to_node(cpu) = node;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100548}
549
Linus Torvalds1da177e2005-04-16 15:20:36 -0700550unsigned long __init numa_free_all_bootmem(void)
551{
552 int i;
553 unsigned long pages = 0;
554 for_each_online_node(i) {
555 pages += free_all_bootmem_node(NODE_DATA(i));
556 }
557 return pages;
558}
559
560void __init paging_init(void)
561{
562 int i;
Mel Gorman6391af12006-10-11 01:20:39 -0700563 unsigned long max_zone_pfns[MAX_NR_ZONES];
564 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
565 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
566 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
567 max_zone_pfns[ZONE_NORMAL] = end_pfn;
Bob Piccod3ee8712005-11-05 17:25:54 +0100568
Bob Piccof0a5a582007-02-13 13:26:25 +0100569 sparse_memory_present_with_active_regions(MAX_NUMNODES);
570 sparse_init();
Bob Piccod3ee8712005-11-05 17:25:54 +0100571
Linus Torvalds1da177e2005-04-16 15:20:36 -0700572 for_each_online_node(i) {
573 setup_node_zones(i);
574 }
Mel Gorman5cb248a2006-09-27 01:49:52 -0700575
576 free_area_init_nodes(max_zone_pfns);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700577}
578
Andi Kleen2c8c0e62006-09-26 10:52:32 +0200579static __init int numa_setup(char *opt)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700580{
Andi Kleen2c8c0e62006-09-26 10:52:32 +0200581 if (!opt)
582 return -EINVAL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700583 if (!strncmp(opt,"off",3))
584 numa_off = 1;
585#ifdef CONFIG_NUMA_EMU
David Rientjes8b8ca80e2007-05-02 19:27:09 +0200586 if (!strncmp(opt, "fake=", 5))
587 cmdline = opt + 5;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700588#endif
589#ifdef CONFIG_ACPI_NUMA
590 if (!strncmp(opt,"noacpi",6))
591 acpi_numa = -1;
Andi Kleen68a3a7f2006-04-07 19:49:18 +0200592 if (!strncmp(opt,"hotadd=", 7))
593 hotadd_percent = simple_strtoul(opt+7, NULL, 10);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700594#endif
Andi Kleen2c8c0e62006-09-26 10:52:32 +0200595 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700596}
597
Andi Kleen2c8c0e62006-09-26 10:52:32 +0200598early_param("numa", numa_setup);
599
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100600/*
601 * Setup early cpu_to_node.
602 *
603 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
604 * and apicid_to_node[] tables have valid entries for a CPU.
605 * This means we skip cpu_to_node[] initialisation for NUMA
606 * emulation and faking node case (when running a kernel compiled
607 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
608 * is already initialized in a round robin manner at numa_init_array,
609 * prior to this call, and this initialization is good enough
610 * for the fake NUMA cases.
611 */
612void __init init_cpu_to_node(void)
613{
614 int i;
615 for (i = 0; i < NR_CPUS; i++) {
Mike Travis71fff5e2007-10-19 20:35:03 +0200616 u8 apicid = x86_cpu_to_apicid_init[i];
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100617 if (apicid == BAD_APICID)
618 continue;
619 if (apicid_to_node[apicid] == NUMA_NO_NODE)
620 continue;
Daniel Yeisleyd1db4ec2006-02-15 15:17:41 -0800621 numa_set_node(i,apicid_to_node[apicid]);
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100622 }
623}
624
Linus Torvalds1da177e2005-04-16 15:20:36 -0700625EXPORT_SYMBOL(cpu_to_node);
626EXPORT_SYMBOL(node_to_cpumask);
Eric Dumazetdcf36bf2006-03-25 16:31:46 +0100627EXPORT_SYMBOL(memnode);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700628EXPORT_SYMBOL(node_data);
Andi Kleencf050132006-01-11 22:46:27 +0100629
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could be all tuned by pre caching more state.
 * Should do that.
 */

/*
 * Return 1 if pfn is below num_physpages, maps to a node (pfn_to_nid() did
 * not yield 0xff) and lies within that node's PFN range; 0 otherwise.
 */
int pfn_valid(unsigned long pfn)
{
	unsigned nid;

	if (pfn >= num_physpages)
		return 0;

	nid = pfn_to_nid(pfn);
	if (nid == 0xff)
		return 0;

	if (pfn < node_start_pfn(nid))
		return 0;
	return (pfn) < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif