blob: 84cde796ecb133ebfaf145262323d7ab20379482 [file] [log] [blame]
/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/numa.h>
19#include <asm/acpi.h>
20
21#ifndef Dprintk
22#define Dprintk(x...)
23#endif
24
25struct pglist_data *node_data[MAX_NUMNODES];
26bootmem_data_t plat_node_bdata[MAX_NUMNODES];
27
28int memnode_shift;
29u8 memnodemap[NODEMAPSIZE];
30
31unsigned char cpu_to_node[NR_CPUS] = { [0 ... NR_CPUS-1] = NUMA_NO_NODE };
32cpumask_t node_to_cpumask[MAX_NUMNODES];
33
34int numa_off __initdata;
35
36int __init compute_hash_shift(struct node *nodes, int numnodes)
37{
38 int i;
39 int shift = 24;
40 u64 addr;
41
42 /* When in doubt use brute force. */
43 while (shift < 48) {
44 memset(memnodemap,0xff,sizeof(*memnodemap) * NODEMAPSIZE);
45 for (i = 0; i < numnodes; i++) {
46 if (nodes[i].start == nodes[i].end)
47 continue;
48 for (addr = nodes[i].start;
49 addr < nodes[i].end;
50 addr += (1UL << shift)) {
51 if (memnodemap[addr >> shift] != 0xff &&
52 memnodemap[addr >> shift] != i) {
53 printk(KERN_INFO
54 "node %d shift %d addr %Lx conflict %d\n",
55 i, shift, addr, memnodemap[addr>>shift]);
56 goto next;
57 }
58 memnodemap[addr >> shift] = i;
59 }
60 }
61 return shift;
62 next:
63 shift++;
64 }
65 memset(memnodemap,0,sizeof(*memnodemap) * NODEMAPSIZE);
66 return -1;
67}
68
#ifdef CONFIG_SPARSEMEM
/*
 * Translate a page frame number to a node id by converting it to a
 * physical address and passing that to phys_to_nid().
 */
int early_pfn_to_nid(unsigned long pfn)
{
	unsigned long phys = pfn << PAGE_SHIFT;

	return phys_to_nid(phys);
}
#endif
75
Linus Torvalds1da177e2005-04-16 15:20:36 -070076/* Initialize bootmem allocator for a node */
77void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
78{
79 unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
80 unsigned long nodedata_phys;
81 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);
82
83 start = round_up(start, ZONE_ALIGN);
84
85 printk("Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);
86
87 start_pfn = start >> PAGE_SHIFT;
88 end_pfn = end >> PAGE_SHIFT;
89
Matt Tolentinobbfceef2005-06-23 00:08:07 -070090 memory_present(nodeid, start_pfn, end_pfn);
Linus Torvalds1da177e2005-04-16 15:20:36 -070091 nodedata_phys = find_e820_area(start, end, pgdat_size);
92 if (nodedata_phys == -1L)
93 panic("Cannot find memory pgdat in node %d\n", nodeid);
94
95 Dprintk("nodedata_phys %lx\n", nodedata_phys);
96
97 node_data[nodeid] = phys_to_virt(nodedata_phys);
98 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
99 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
100 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
101 NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
102
103 /* Find a place for the bootmem map */
104 bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
105 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
106 bootmap_start = find_e820_area(bootmap_start, end, bootmap_pages<<PAGE_SHIFT);
107 if (bootmap_start == -1L)
108 panic("Not enough continuous space for bootmap on node %d", nodeid);
109 Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);
110
111 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
112 bootmap_start >> PAGE_SHIFT,
113 start_pfn, end_pfn);
114
115 e820_bootmem_free(NODE_DATA(nodeid), start, end);
116
117 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
118 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<<PAGE_SHIFT);
119 node_set_online(nodeid);
120}
121
122/* Initialize final allocator for a zone */
123void __init setup_node_zones(int nodeid)
124{
125 unsigned long start_pfn, end_pfn;
126 unsigned long zones[MAX_NR_ZONES];
127 unsigned long dma_end_pfn;
128
129 memset(zones, 0, sizeof(unsigned long) * MAX_NR_ZONES);
130
131 start_pfn = node_start_pfn(nodeid);
132 end_pfn = node_end_pfn(nodeid);
133
134 Dprintk(KERN_INFO "setting up node %d %lx-%lx\n", nodeid, start_pfn, end_pfn);
135
136 /* All nodes > 0 have a zero length zone DMA */
137 dma_end_pfn = __pa(MAX_DMA_ADDRESS) >> PAGE_SHIFT;
138 if (start_pfn < dma_end_pfn) {
139 zones[ZONE_DMA] = dma_end_pfn - start_pfn;
140 zones[ZONE_NORMAL] = end_pfn - dma_end_pfn;
141 } else {
142 zones[ZONE_NORMAL] = end_pfn - start_pfn;
143 }
144
145 free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
146 start_pfn, NULL);
147}
148
149void __init numa_init_array(void)
150{
151 int rr, i;
152 /* There are unfortunately some poorly designed mainboards around
153 that only connect memory to a single CPU. This breaks the 1:1 cpu->node
154 mapping. To avoid this fill in the mapping for all possible
155 CPUs, as the number of CPUs is not known yet.
156 We round robin the existing nodes. */
157 rr = 0;
158 for (i = 0; i < NR_CPUS; i++) {
159 if (cpu_to_node[i] != NUMA_NO_NODE)
160 continue;
161 rr = next_node(rr, node_online_map);
162 if (rr == MAX_NUMNODES)
163 rr = first_node(node_online_map);
164 cpu_to_node[i] = rr;
165 rr++;
166 }
167
168 set_bit(0, &node_to_cpumask[cpu_to_node(0)]);
169}
170
#ifdef CONFIG_NUMA_EMU
/* Number of fake nodes requested with the "fake=" option; 0 disables emulation. */
int numa_fake __initdata = 0;

/*
 * Numa emulation: split [start_pfn, end_pfn) into numa_fake fake nodes.
 *
 * All nodes but the last get the same size, rounded down to a power of two
 * when it is not one already; the last node takes whatever remains.  Each
 * fake node is marked online, a hash shift is computed for the layout and
 * bootmem is set up for every online node.
 *
 * Returns 0 on success, -1 if emulation cannot be used (no usable node
 * count or no hash shift found).
 */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
	int i;
	struct node nodes[MAX_NUMNODES];
	unsigned long sz;

	/* Guards the division below and the loop bounds. */
	if (numa_fake <= 0)
		return -1;

	sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;

	/* Kludge needed for the hash function */
	if (hweight64(sz) > 1) {
		unsigned long x = 1;
		while ((x << 1) < sz)
			x <<= 1;
		if (x < sz/2)
			printk("Numa emulation unbalanced. Complain to maintainer\n");
		sz = x;
	}

	memset(&nodes,0,sizeof(nodes));
	for (i = 0; i < numa_fake; i++) {
		nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
		if (i == numa_fake-1)
			sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
		/*
		 * Keep the end exclusive.  It is later converted with
		 * end >> PAGE_SHIFT in setup_node_bootmem(); an inclusive
		 * end (start + sz - 1) would drop the last page of the node.
		 */
		nodes[i].end = nodes[i].start + sz;
		printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
		       i,
		       nodes[i].start, nodes[i].end,
		       (nodes[i].end - nodes[i].start) >> 20);
		node_set_online(i);
	}
	memnode_shift = compute_hash_shift(nodes, numa_fake);
	if (memnode_shift < 0) {
		memnode_shift = 0;
		printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
		return -1;
	}
	for_each_online_node(i)
		setup_node_bootmem(i, nodes[i].start, nodes[i].end);
	numa_init_array();
	return 0;
}
#endif
217
218void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
219{
220 int i;
221
222#ifdef CONFIG_NUMA_EMU
223 if (numa_fake && !numa_emulation(start_pfn, end_pfn))
224 return;
225#endif
226
227#ifdef CONFIG_ACPI_NUMA
228 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
229 end_pfn << PAGE_SHIFT))
230 return;
231#endif
232
233#ifdef CONFIG_K8_NUMA
234 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
235 return;
236#endif
237 printk(KERN_INFO "%s\n",
238 numa_off ? "NUMA turned off" : "No NUMA configuration found");
239
240 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
241 start_pfn << PAGE_SHIFT,
242 end_pfn << PAGE_SHIFT);
243 /* setup dummy node covering all memory */
244 memnode_shift = 63;
245 memnodemap[0] = 0;
246 nodes_clear(node_online_map);
247 node_set_online(0);
248 for (i = 0; i < NR_CPUS; i++)
249 cpu_to_node[i] = 0;
250 node_to_cpumask[0] = cpumask_of_cpu(0);
251 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
252}
253
254__init void numa_add_cpu(int cpu)
255{
256 /* BP is initialized elsewhere */
257 if (cpu)
258 set_bit(cpu, &node_to_cpumask[cpu_to_node(cpu)]);
259}
260
261unsigned long __init numa_free_all_bootmem(void)
262{
263 int i;
264 unsigned long pages = 0;
265 for_each_online_node(i) {
266 pages += free_all_bootmem_node(NODE_DATA(i));
267 }
268 return pages;
269}
270
271void __init paging_init(void)
272{
273 int i;
274 for_each_online_node(i) {
275 setup_node_zones(i);
276 }
277}
278
279/* [numa=off] */
280__init int numa_setup(char *opt)
281{
282 if (!strncmp(opt,"off",3))
283 numa_off = 1;
284#ifdef CONFIG_NUMA_EMU
285 if(!strncmp(opt, "fake=", 5)) {
286 numa_fake = simple_strtoul(opt+5,NULL,0); ;
287 if (numa_fake >= MAX_NUMNODES)
288 numa_fake = MAX_NUMNODES;
289 }
290#endif
291#ifdef CONFIG_ACPI_NUMA
292 if (!strncmp(opt,"noacpi",6))
293 acpi_numa = -1;
294#endif
295 return 1;
296}
297
/* NUMA lookup tables made available to modules. */
EXPORT_SYMBOL(node_data);
EXPORT_SYMBOL(memnodemap);
EXPORT_SYMBOL(memnode_shift);
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);