blob: 6ef9f9a762356c955fe3927fbed882b663e0dd2e [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
/* Debug print hook: expands to nothing unless Dprintk was defined earlier. */
#ifndef Dprintk
#define Dprintk(x...)
#endif

/* Per-node pg_data_t pointers; each entry is set by setup_node_bootmem(). */
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
/* Per-node bootmem descriptors that each node's pg_data_t->bdata points to. */
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

/*
 * Address-to-node lookup table: memnodemap[addr >> memnode_shift] holds the
 * node index, or 0xff for slots populate_memnodemap() left unassigned.
 */
int memnode_shift;
u8 memnodemap[NODEMAPSIZE];

/* CPU -> node table; every entry starts out as NUMA_NO_NODE. */
unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
	[0 ... NR_CPUS-1] = NUMA_NO_NODE
};
/* Local APIC id -> node table; every entry starts out as NUMA_NO_NODE. */
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
/* Per-node mask of CPUs; bits are added by numa_add_cpu(). */
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;

/* Non-zero once numa_setup() has seen the "off" option. */
int numa_off __initdata;
40
Eric Dumazet529a3402005-11-05 17:25:54 +010041
42/*
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
46 * 0 if memnodmap[] too small (of shift too small)
47 * -1 if node overlap or lost ram (shift too big)
48 */
Andi Kleend18ff472006-01-11 22:44:30 +010049static int __init
50populate_memnodemap(const struct node *nodes, int numnodes, int shift)
Linus Torvalds1da177e2005-04-16 15:20:36 -070051{
52 int i;
Eric Dumazet529a3402005-11-05 17:25:54 +010053 int res = -1;
54 unsigned long addr, end;
Keith Manntheyb6846642005-07-28 21:15:38 -070055
Eric Dumazet8309cf62005-12-12 22:17:14 -080056 if (shift >= 64)
57 return -1;
Eric Dumazet529a3402005-11-05 17:25:54 +010058 memset(memnodemap, 0xff, sizeof(memnodemap));
59 for (i = 0; i < numnodes; i++) {
60 addr = nodes[i].start;
61 end = nodes[i].end;
62 if (addr >= end)
63 continue;
64 if ((end >> shift) >= NODEMAPSIZE)
65 return 0;
66 do {
67 if (memnodemap[addr >> shift] != 0xff)
68 return -1;
69 memnodemap[addr >> shift] = i;
Eric Dumazet8309cf62005-12-12 22:17:14 -080070 addr += (1UL << shift);
Eric Dumazet529a3402005-11-05 17:25:54 +010071 } while (addr < end);
72 res = 1;
73 }
74 return res;
75}
76
77int __init compute_hash_shift(struct node *nodes, int numnodes)
78{
79 int shift = 20;
80
81 while (populate_memnodemap(nodes, numnodes, shift + 1) >= 0)
Keith Manntheyb6846642005-07-28 21:15:38 -070082 shift++;
83
Andi Kleen6b050f82006-01-11 22:44:33 +010084 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
Eric Dumazet529a3402005-11-05 17:25:54 +010085 shift);
86
87 if (populate_memnodemap(nodes, numnodes, shift) != 1) {
88 printk(KERN_INFO
Keith Manntheyb6846642005-07-28 21:15:38 -070089 "Your memory is not aligned you need to rebuild your kernel "
Eric Dumazet529a3402005-11-05 17:25:54 +010090 "with a bigger NODEMAPSIZE shift=%d\n",
91 shift);
92 return -1;
93 }
Keith Manntheyb6846642005-07-28 21:15:38 -070094 return shift;
Linus Torvalds1da177e2005-04-16 15:20:36 -070095}
96
#ifdef CONFIG_SPARSEMEM
/* Return the node id phys_to_nid() reports for the physical address of pfn. */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
103
Linus Torvalds1da177e2005-04-16 15:20:36 -0700104/* Initialize bootmem allocator for a node */
105void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
106{
107 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
108 unsigned long nodedata_phys;
109 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
110
111 start = round_up(start, ZONE_ALIGN);
112
Andi Kleen6b050f82006-01-11 22:44:33 +0100113 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700114
115 start_pfn = start >> PAGE_SHIFT;
116 end_pfn = end >> PAGE_SHIFT;
117
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118 nodedata_phys = find_e820_area(start, end, pgdat_size);
119 if (nodedata_phys == -1L)
120 panic("Cannot find memory pgdat in node %d\n", nodeid);
121
122 Dprintk("nodedata_phys %lx\n", nodedata_phys);
123
124 node_data[nodeid] = phys_to_virt(nodedata_phys);
125 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
126 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
127 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
128 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
129
130 /* Find a place for the bootmem map */
131 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
132 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
133 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
134 if (bootmap_start == -1L)
135 panic("Not enough continuous space for bootmap on node %d", nodeid);
136 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
137
138 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
139 bootmap_start >> PAGE_SHIFT,
140 start_pfn, end_pfn);
141
142 e820_bootmem_free(NODE_DATA(nodeid), start, end);
143
144 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
145 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
146 node_set_online(nodeid);
147}
148
149/* Initialize final allocator for a zone */
150void __init setup_node_zones(int nodeid)
151{
152 unsigned long start_pfn, end_pfn;
153 unsigned long zones[MAX_NR_ZONES];
Andi Kleen485761b2005-08-26 18:34:10 -0700154 unsigned long holes[MAX_NR_ZONES];
Linus Torvalds1da177e2005-04-16 15:20:36 -0700155
Andi Kleena2f1b422005-11-05 17:25:53 +0100156 start_pfn = node_start_pfn(nodeid);
157 end_pfn = node_end_pfn(nodeid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700158
Andi Kleen6b050f82006-01-11 22:44:33 +0100159 Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
Andi Kleena2f1b422005-11-05 17:25:53 +0100160 nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700161
Andi Kleena2f1b422005-11-05 17:25:53 +0100162 size_zones(zones, holes, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700163 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
Andi Kleen485761b2005-08-26 18:34:10 -0700164 start_pfn, holes);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165}
166
167void __init numa_init_array(void)
168{
169 int rr, i;
170 /* There are unfortunately some poorly designed mainboards around
171 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
172 mapping. To avoid this fill in the mapping for all possible
173 CPUs, as the number of CPUs is not known yet.
174 We round robin the existing nodes. */
Ravikiran G Thirumalai85cc5132005-09-30 11:59:22 -0700175 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700176 for (i = 0; i < NR_CPUS; i++) {
177 if (cpu_to_node[i] != NUMA_NO_NODE)
178 continue;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100179 numa_set_node(i, rr);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700180 rr = next_node(rr, node_online_map);
181 if (rr == MAX_NUMNODES)
182 rr = first_node(node_online_map);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700183 }
184
Linus Torvalds1da177e2005-04-16 15:20:36 -0700185}
186
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested through the "fake=" option; 0 disables. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split the range start_pfn..end_pfn into numa_fake nodes.
 *
 * The per-node size is rounded down to a power of two for the benefit of
 * the memnodemap hash; the last node takes whatever remains up to end_pfn.
 * The caller must ensure numa_fake is non-zero.
 *
 * Marked __init because it reads __initdata and calls __init functions.
 *
 * Returns 0 on success, or -1 if no hash shift could be found. Nodes are
 * only brought online (by setup_node_bootmem()) after a hash shift was
 * found, so a failed attempt returns without marking any fake node online.
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		/* The last node absorbs the remainder left by the rounding. */
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	/* setup_node_bootmem() marks each node online once it is set up. */
	for (i = 0; i < numa_fake; i++)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
231
232void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
233{
234 int i;
235
236#ifdef CONFIG_NUMA_EMU
237 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
238 return;
239#endif
240
241#ifdef CONFIG_ACPI_NUMA
242 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
243 end_pfn << PAGE_SHIFT))
244 return;
245#endif
246
247#ifdef CONFIG_K8_NUMA
248 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
249 return;
250#endif
251 printk(KERN_INFO "%s\n",
252 numa_off ? "NUMA turned off" : "No NUMA configuration found");
253
254 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
255 start_pfn << PAGE_SHIFT,
256 end_pfn << PAGE_SHIFT);
257 /* setup dummy node covering all memory */
258 memnode_shift = 63;
259 memnodemap[0] = 0;
260 nodes_clear(node_online_map);
261 node_set_online(0);
262 for (i = 0; i < NR_CPUS; i++)
Andi Kleen69d81fc2005-11-05 17:25:53 +0100263 numa_set_node(i, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700264 node_to_cpumask[0] = cpumask_of_cpu(0);
265 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
266}
267
Ashok Raje6982c62005-06-25 14:54:58 -0700268__cpuinit void numa_add_cpu(int cpu)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700269{
Ravikiran G Thirumalaie6a045a2005-09-30 11:59:21 -0700270 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700271}
272
Andi Kleen69d81fc2005-11-05 17:25:53 +0100273void __cpuinit numa_set_node(int cpu, int node)
274{
Ravikiran G Thirumalaidf79efd2006-01-11 22:45:39 +0100275 cpu_pda(cpu)->nodenumber = node;
Andi Kleen69d81fc2005-11-05 17:25:53 +0100276 cpu_to_node[cpu] = node;
277}
278
Linus Torvalds1da177e2005-04-16 15:20:36 -0700279unsigned long __init numa_free_all_bootmem(void)
280{
281 int i;
282 unsigned long pages = 0;
283 for_each_online_node(i) {
284 pages += free_all_bootmem_node(NODE_DATA(i));
285 }
286 return pages;
287}
288
#ifdef CONFIG_SPARSEMEM
/*
 * Report the pfn range of every online node through memory_present(),
 * then run sparse_init().
 */
static void __init arch_sparse_init(void)
{
	int nid;

	for_each_online_node(nid) {
		memory_present(nid, node_start_pfn(nid), node_end_pfn(nid));
	}

	sparse_init();
}
#else
/* Without CONFIG_SPARSEMEM there is nothing to do. */
#define arch_sparse_init() do {} while (0)
#endif
302
Linus Torvalds1da177e2005-04-16 15:20:36 -0700303void __init paging_init(void)
304{
305 int i;
Bob Piccod3ee8712005-11-05 17:25:54 +0100306
307 arch_sparse_init();
308
Linus Torvalds1da177e2005-04-16 15:20:36 -0700309 for_each_online_node(i) {
310 setup_node_zones(i);
311 }
312}
313
314/* [numa=off] */
315__init int numa_setup(char *opt)
316{
317 if (!strncmp(opt,"off",3))
318 numa_off = 1;
319#ifdef CONFIG_NUMA_EMU
320 if(!strncmp(opt, "fake=", 5)) {
321 numa_fake = simple_strtoul(opt+5,NULL,0); ;
322 if (numa_fake >= MAX_NUMNODES)
323 numa_fake = MAX_NUMNODES;
324 }
325#endif
326#ifdef CONFIG_ACPI_NUMA
327 if (!strncmp(opt,"noacpi",6))
328 acpi_numa = -1;
329#endif
330 return 1;
331}
332
Ravikiran Thirumalai05b3cbd2006-01-11 22:45:36 +0100333/*
334 * Setup early cpu_to_node.
335 *
336 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
337 * and apicid_to_node[] tables have valid entries for a CPU.
338 * This means we skip cpu_to_node[] initialisation for NUMA
339 * emulation and faking node case (when running a kernel compiled
340 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
341 * is already initialized in a round robin manner at numa_init_array,
342 * prior to this call, and this initialization is good enough
343 * for the fake NUMA cases.
344 */
345void __init init_cpu_to_node(void)
346{
347 int i;
348 for (i = 0; i < NR_CPUS; i++) {
349 u8 apicid = x86_cpu_to_apicid[i];
350 if (apicid == BAD_APICID)
351 continue;
352 if (apicid_to_node[apicid] == NUMA_NO_NODE)
353 continue;
354 cpu_to_node[i] = apicid_to_node[apicid];
355 }
356}
357
/* Export the NUMA lookup tables defined in this file. */
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(node_data);
Andi Kleencf050132006-01-11 22:46:27 +0100363
364#ifdef CONFIG_DISCONTIGMEM
365/*
366 * Functions to convert PFNs from/to per node page addresses.
367 * These are out of line because they are quite big.
368 * They could be all tuned by pre caching more state.
369 * Should do that.
370 */
371
372/* Requires pfn_valid(pfn) to be true */
373struct page *pfn_to_page(unsigned long pfn)
374{
375 int nid = phys_to_nid(((unsigned long)(pfn)) << PAGE_SHIFT);
376 return (pfn - node_start_pfn(nid)) + NODE_DATA(nid)->node_mem_map;
377}
378EXPORT_SYMBOL(pfn_to_page);
379
380unsigned long page_to_pfn(struct page *page)
381{
382 return (long)(((page) - page_zone(page)->zone_mem_map) +
383 page_zone(page)->zone_start_pfn);
384}
385EXPORT_SYMBOL(page_to_pfn);
386
387int pfn_valid(unsigned long pfn)
388{
389 unsigned nid;
390 if (pfn >= num_physpages)
391 return 0;
392 nid = pfn_to_nid(pfn);
393 if (nid == 0xff)
394 return 0;
395 return pfn >= node_start_pfn(nid) && (pfn) < node_end_pfn(nid);
396}
397EXPORT_SYMBOL(pfn_valid);
398#endif