Linus Torvalds | 1da177e | 2005-04-16 15:20:36 -0700 | [diff] [blame^] | 1 | /* |
| 2 | * Some of the code in this file has been gleaned from the 64 bit |
| 3 | * discontigmem support code base. |
| 4 | * |
| 5 | * Copyright (C) 2002, IBM Corp. |
| 6 | * |
| 7 | * All rights reserved. |
| 8 | * |
| 9 | * This program is free software; you can redistribute it and/or modify |
| 10 | * it under the terms of the GNU General Public License as published by |
| 11 | * the Free Software Foundation; either version 2 of the License, or |
| 12 | * (at your option) any later version. |
| 13 | * |
| 14 | * This program is distributed in the hope that it will be useful, but |
| 15 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 16 | * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or |
| 17 | * NON INFRINGEMENT. See the GNU General Public License for more |
| 18 | * details. |
| 19 | * |
| 20 | * You should have received a copy of the GNU General Public License |
| 21 | * along with this program; if not, write to the Free Software |
| 22 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
| 23 | * |
| 24 | * Send feedback to Pat Gaughen <gone@us.ibm.com> |
| 25 | */ |
| 26 | #include <linux/config.h> |
| 27 | #include <linux/mm.h> |
| 28 | #include <linux/bootmem.h> |
| 29 | #include <linux/mmzone.h> |
| 30 | #include <linux/acpi.h> |
| 31 | #include <linux/nodemask.h> |
| 32 | #include <asm/srat.h> |
| 33 | #include <asm/topology.h> |
| 34 | |
| 35 | /* |
| 36 | * proximity macros and definitions |
| 37 | */ |
| 38 | #define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */ |
| 39 | #define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */ |
| 40 | #define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit)) |
| 41 | #define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit))) |
| 42 | #define MAX_PXM_DOMAINS 256 /* 1 byte and no promises about values */ |
| 43 | /* bitmap length; _PXM is at most 255 */ |
| 44 | #define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8) |
| 45 | static u8 pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */ |
| 46 | |
| 47 | #define MAX_CHUNKS_PER_NODE 4 |
| 48 | #define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES) |
| 49 | struct node_memory_chunk_s { |
| 50 | unsigned long start_pfn; |
| 51 | unsigned long end_pfn; |
| 52 | u8 pxm; // proximity domain of node |
| 53 | u8 nid; // which cnode contains this chunk? |
| 54 | u8 bank; // which mem bank on this node |
| 55 | }; |
| 56 | static struct node_memory_chunk_s node_memory_chunk[MAXCHUNKS]; |
| 57 | |
| 58 | static int num_memory_chunks; /* total number of memory chunks */ |
| 59 | static int zholes_size_init; |
| 60 | static unsigned long zholes_size[MAX_NUMNODES * MAX_NR_ZONES]; |
| 61 | |
| 62 | extern void * boot_ioremap(unsigned long, unsigned long); |
| 63 | |
| 64 | /* Identify CPU proximity domains */ |
| 65 | static void __init parse_cpu_affinity_structure(char *p) |
| 66 | { |
| 67 | struct acpi_table_processor_affinity *cpu_affinity = |
| 68 | (struct acpi_table_processor_affinity *) p; |
| 69 | |
| 70 | if (!cpu_affinity->flags.enabled) |
| 71 | return; /* empty entry */ |
| 72 | |
| 73 | /* mark this node as "seen" in node bitmap */ |
| 74 | BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain); |
| 75 | |
| 76 | printk("CPU 0x%02X in proximity domain 0x%02X\n", |
| 77 | cpu_affinity->apic_id, cpu_affinity->proximity_domain); |
| 78 | } |
| 79 | |
| 80 | /* |
| 81 | * Identify memory proximity domains and hot-remove capabilities. |
| 82 | * Fill node memory chunk list structure. |
| 83 | */ |
| 84 | static void __init parse_memory_affinity_structure (char *sratp) |
| 85 | { |
| 86 | unsigned long long paddr, size; |
| 87 | unsigned long start_pfn, end_pfn; |
| 88 | u8 pxm; |
| 89 | struct node_memory_chunk_s *p, *q, *pend; |
| 90 | struct acpi_table_memory_affinity *memory_affinity = |
| 91 | (struct acpi_table_memory_affinity *) sratp; |
| 92 | |
| 93 | if (!memory_affinity->flags.enabled) |
| 94 | return; /* empty entry */ |
| 95 | |
| 96 | /* mark this node as "seen" in node bitmap */ |
| 97 | BMAP_SET(pxm_bitmap, memory_affinity->proximity_domain); |
| 98 | |
| 99 | /* calculate info for memory chunk structure */ |
| 100 | paddr = memory_affinity->base_addr_hi; |
| 101 | paddr = (paddr << 32) | memory_affinity->base_addr_lo; |
| 102 | size = memory_affinity->length_hi; |
| 103 | size = (size << 32) | memory_affinity->length_lo; |
| 104 | |
| 105 | start_pfn = paddr >> PAGE_SHIFT; |
| 106 | end_pfn = (paddr + size) >> PAGE_SHIFT; |
| 107 | |
| 108 | pxm = memory_affinity->proximity_domain; |
| 109 | |
| 110 | if (num_memory_chunks >= MAXCHUNKS) { |
| 111 | printk("Too many mem chunks in SRAT. Ignoring %lld MBytes at %llx\n", |
| 112 | size/(1024*1024), paddr); |
| 113 | return; |
| 114 | } |
| 115 | |
| 116 | /* Insertion sort based on base address */ |
| 117 | pend = &node_memory_chunk[num_memory_chunks]; |
| 118 | for (p = &node_memory_chunk[0]; p < pend; p++) { |
| 119 | if (start_pfn < p->start_pfn) |
| 120 | break; |
| 121 | } |
| 122 | if (p < pend) { |
| 123 | for (q = pend; q >= p; q--) |
| 124 | *(q + 1) = *q; |
| 125 | } |
| 126 | p->start_pfn = start_pfn; |
| 127 | p->end_pfn = end_pfn; |
| 128 | p->pxm = pxm; |
| 129 | |
| 130 | num_memory_chunks++; |
| 131 | |
| 132 | printk("Memory range 0x%lX to 0x%lX (type 0x%X) in proximity domain 0x%02X %s\n", |
| 133 | start_pfn, end_pfn, |
| 134 | memory_affinity->memory_type, |
| 135 | memory_affinity->proximity_domain, |
| 136 | (memory_affinity->flags.hot_pluggable ? |
| 137 | "enabled and removable" : "enabled" ) ); |
| 138 | } |
| 139 | |
| 140 | #if MAX_NR_ZONES != 3 |
| 141 | #error "MAX_NR_ZONES != 3, chunk_to_zone requires review" |
| 142 | #endif |
| 143 | /* Take a chunk of pages from page frame cstart to cend and count the number |
| 144 | * of pages in each zone, returned via zones[]. |
| 145 | */ |
| 146 | static __init void chunk_to_zones(unsigned long cstart, unsigned long cend, |
| 147 | unsigned long *zones) |
| 148 | { |
| 149 | unsigned long max_dma; |
| 150 | extern unsigned long max_low_pfn; |
| 151 | |
| 152 | int z; |
| 153 | unsigned long rend; |
| 154 | |
| 155 | /* FIXME: MAX_DMA_ADDRESS and max_low_pfn are trying to provide |
| 156 | * similarly scoped information and should be handled in a consistant |
| 157 | * manner. |
| 158 | */ |
| 159 | max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; |
| 160 | |
| 161 | /* Split the hole into the zones in which it falls. Repeatedly |
| 162 | * take the segment in which the remaining hole starts, round it |
| 163 | * to the end of that zone. |
| 164 | */ |
| 165 | memset(zones, 0, MAX_NR_ZONES * sizeof(long)); |
| 166 | while (cstart < cend) { |
| 167 | if (cstart < max_dma) { |
| 168 | z = ZONE_DMA; |
| 169 | rend = (cend < max_dma)? cend : max_dma; |
| 170 | |
| 171 | } else if (cstart < max_low_pfn) { |
| 172 | z = ZONE_NORMAL; |
| 173 | rend = (cend < max_low_pfn)? cend : max_low_pfn; |
| 174 | |
| 175 | } else { |
| 176 | z = ZONE_HIGHMEM; |
| 177 | rend = cend; |
| 178 | } |
| 179 | zones[z] += rend - cstart; |
| 180 | cstart = rend; |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | /* |
| 185 | * The SRAT table always lists ascending addresses, so can always |
| 186 | * assume that the first "start" address that you see is the real |
| 187 | * start of the node, and that the current "end" address is after |
| 188 | * the previous one. |
| 189 | */ |
| 190 | static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) |
| 191 | { |
| 192 | /* |
| 193 | * Only add present memory as told by the e820. |
| 194 | * There is no guarantee from the SRAT that the memory it |
| 195 | * enumerates is present at boot time because it represents |
| 196 | * *possible* memory hotplug areas the same as normal RAM. |
| 197 | */ |
| 198 | if (memory_chunk->start_pfn >= max_pfn) { |
| 199 | printk (KERN_INFO "Ignoring SRAT pfns: 0x%08lx -> %08lx\n", |
| 200 | memory_chunk->start_pfn, memory_chunk->end_pfn); |
| 201 | return; |
| 202 | } |
| 203 | if (memory_chunk->nid != nid) |
| 204 | return; |
| 205 | |
| 206 | if (!node_has_online_mem(nid)) |
| 207 | node_start_pfn[nid] = memory_chunk->start_pfn; |
| 208 | |
| 209 | if (node_start_pfn[nid] > memory_chunk->start_pfn) |
| 210 | node_start_pfn[nid] = memory_chunk->start_pfn; |
| 211 | |
| 212 | if (node_end_pfn[nid] < memory_chunk->end_pfn) |
| 213 | node_end_pfn[nid] = memory_chunk->end_pfn; |
| 214 | } |
| 215 | |
| 216 | /* Parse the ACPI Static Resource Affinity Table */ |
| 217 | static int __init acpi20_parse_srat(struct acpi_table_srat *sratp) |
| 218 | { |
| 219 | u8 *start, *end, *p; |
| 220 | int i, j, nid; |
| 221 | u8 pxm_to_nid_map[MAX_PXM_DOMAINS];/* _PXM to logical node ID map */ |
| 222 | u8 nid_to_pxm_map[MAX_NUMNODES];/* logical node ID to _PXM map */ |
| 223 | |
| 224 | start = (u8 *)(&(sratp->reserved) + 1); /* skip header */ |
| 225 | p = start; |
| 226 | end = (u8 *)sratp + sratp->header.length; |
| 227 | |
| 228 | memset(pxm_bitmap, 0, sizeof(pxm_bitmap)); /* init proximity domain bitmap */ |
| 229 | memset(node_memory_chunk, 0, sizeof(node_memory_chunk)); |
| 230 | memset(zholes_size, 0, sizeof(zholes_size)); |
| 231 | |
| 232 | /* -1 in these maps means not available */ |
| 233 | memset(pxm_to_nid_map, -1, sizeof(pxm_to_nid_map)); |
| 234 | memset(nid_to_pxm_map, -1, sizeof(nid_to_pxm_map)); |
| 235 | |
| 236 | num_memory_chunks = 0; |
| 237 | while (p < end) { |
| 238 | switch (*p) { |
| 239 | case ACPI_SRAT_PROCESSOR_AFFINITY: |
| 240 | parse_cpu_affinity_structure(p); |
| 241 | break; |
| 242 | case ACPI_SRAT_MEMORY_AFFINITY: |
| 243 | parse_memory_affinity_structure(p); |
| 244 | break; |
| 245 | default: |
| 246 | printk("ACPI 2.0 SRAT: unknown entry skipped: type=0x%02X, len=%d\n", p[0], p[1]); |
| 247 | break; |
| 248 | } |
| 249 | p += p[1]; |
| 250 | if (p[1] == 0) { |
| 251 | printk("acpi20_parse_srat: Entry length value is zero;" |
| 252 | " can't parse any further!\n"); |
| 253 | break; |
| 254 | } |
| 255 | } |
| 256 | |
| 257 | if (num_memory_chunks == 0) { |
| 258 | printk("could not finy any ACPI SRAT memory areas.\n"); |
| 259 | goto out_fail; |
| 260 | } |
| 261 | |
| 262 | /* Calculate total number of nodes in system from PXM bitmap and create |
| 263 | * a set of sequential node IDs starting at zero. (ACPI doesn't seem |
| 264 | * to specify the range of _PXM values.) |
| 265 | */ |
| 266 | /* |
| 267 | * MCD - we no longer HAVE to number nodes sequentially. PXM domain |
| 268 | * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically |
| 269 | * 32, so we will continue numbering them in this manner until MAX_NUMNODES |
| 270 | * approaches MAX_PXM_DOMAINS for i386. |
| 271 | */ |
| 272 | nodes_clear(node_online_map); |
| 273 | for (i = 0; i < MAX_PXM_DOMAINS; i++) { |
| 274 | if (BMAP_TEST(pxm_bitmap, i)) { |
| 275 | nid = num_online_nodes(); |
| 276 | pxm_to_nid_map[i] = nid; |
| 277 | nid_to_pxm_map[nid] = i; |
| 278 | node_set_online(nid); |
| 279 | } |
| 280 | } |
| 281 | BUG_ON(num_online_nodes() == 0); |
| 282 | |
| 283 | /* set cnode id in memory chunk structure */ |
| 284 | for (i = 0; i < num_memory_chunks; i++) |
| 285 | node_memory_chunk[i].nid = pxm_to_nid_map[node_memory_chunk[i].pxm]; |
| 286 | |
| 287 | printk("pxm bitmap: "); |
| 288 | for (i = 0; i < sizeof(pxm_bitmap); i++) { |
| 289 | printk("%02X ", pxm_bitmap[i]); |
| 290 | } |
| 291 | printk("\n"); |
| 292 | printk("Number of logical nodes in system = %d\n", num_online_nodes()); |
| 293 | printk("Number of memory chunks in system = %d\n", num_memory_chunks); |
| 294 | |
| 295 | for (j = 0; j < num_memory_chunks; j++){ |
| 296 | struct node_memory_chunk_s * chunk = &node_memory_chunk[j]; |
| 297 | printk("chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", |
| 298 | j, chunk->nid, chunk->start_pfn, chunk->end_pfn); |
| 299 | node_read_chunk(chunk->nid, chunk); |
| 300 | } |
| 301 | |
| 302 | for_each_online_node(nid) { |
| 303 | unsigned long start = node_start_pfn[nid]; |
| 304 | unsigned long end = node_end_pfn[nid]; |
| 305 | |
| 306 | memory_present(nid, start, end); |
| 307 | node_remap_size[nid] = node_memmap_size_bytes(nid, start, end); |
| 308 | } |
| 309 | return 1; |
| 310 | out_fail: |
| 311 | return 0; |
| 312 | } |
| 313 | |
| 314 | int __init get_memcfg_from_srat(void) |
| 315 | { |
| 316 | struct acpi_table_header *header = NULL; |
| 317 | struct acpi_table_rsdp *rsdp = NULL; |
| 318 | struct acpi_table_rsdt *rsdt = NULL; |
| 319 | struct acpi_pointer *rsdp_address = NULL; |
| 320 | struct acpi_table_rsdt saved_rsdt; |
| 321 | int tables = 0; |
| 322 | int i = 0; |
| 323 | |
| 324 | acpi_find_root_pointer(ACPI_PHYSICAL_ADDRESSING, rsdp_address); |
| 325 | |
| 326 | if (rsdp_address->pointer_type == ACPI_PHYSICAL_POINTER) { |
| 327 | printk("%s: assigning address to rsdp\n", __FUNCTION__); |
| 328 | rsdp = (struct acpi_table_rsdp *) |
| 329 | (u32)rsdp_address->pointer.physical; |
| 330 | } else { |
| 331 | printk("%s: rsdp_address is not a physical pointer\n", __FUNCTION__); |
| 332 | goto out_err; |
| 333 | } |
| 334 | if (!rsdp) { |
| 335 | printk("%s: Didn't find ACPI root!\n", __FUNCTION__); |
| 336 | goto out_err; |
| 337 | } |
| 338 | |
| 339 | printk(KERN_INFO "%.8s v%d [%.6s]\n", rsdp->signature, rsdp->revision, |
| 340 | rsdp->oem_id); |
| 341 | |
| 342 | if (strncmp(rsdp->signature, RSDP_SIG,strlen(RSDP_SIG))) { |
| 343 | printk(KERN_WARNING "%s: RSDP table signature incorrect\n", __FUNCTION__); |
| 344 | goto out_err; |
| 345 | } |
| 346 | |
| 347 | rsdt = (struct acpi_table_rsdt *) |
| 348 | boot_ioremap(rsdp->rsdt_address, sizeof(struct acpi_table_rsdt)); |
| 349 | |
| 350 | if (!rsdt) { |
| 351 | printk(KERN_WARNING |
| 352 | "%s: ACPI: Invalid root system description tables (RSDT)\n", |
| 353 | __FUNCTION__); |
| 354 | goto out_err; |
| 355 | } |
| 356 | |
| 357 | header = & rsdt->header; |
| 358 | |
| 359 | if (strncmp(header->signature, RSDT_SIG, strlen(RSDT_SIG))) { |
| 360 | printk(KERN_WARNING "ACPI: RSDT signature incorrect\n"); |
| 361 | goto out_err; |
| 362 | } |
| 363 | |
| 364 | /* |
| 365 | * The number of tables is computed by taking the |
| 366 | * size of all entries (header size minus total |
| 367 | * size of RSDT) divided by the size of each entry |
| 368 | * (4-byte table pointers). |
| 369 | */ |
| 370 | tables = (header->length - sizeof(struct acpi_table_header)) / 4; |
| 371 | |
| 372 | if (!tables) |
| 373 | goto out_err; |
| 374 | |
| 375 | memcpy(&saved_rsdt, rsdt, sizeof(saved_rsdt)); |
| 376 | |
| 377 | if (saved_rsdt.header.length > sizeof(saved_rsdt)) { |
| 378 | printk(KERN_WARNING "ACPI: Too big length in RSDT: %d\n", |
| 379 | saved_rsdt.header.length); |
| 380 | goto out_err; |
| 381 | } |
| 382 | |
| 383 | printk("Begin SRAT table scan....\n"); |
| 384 | |
| 385 | for (i = 0; i < tables; i++) { |
| 386 | /* Map in header, then map in full table length. */ |
| 387 | header = (struct acpi_table_header *) |
| 388 | boot_ioremap(saved_rsdt.entry[i], sizeof(struct acpi_table_header)); |
| 389 | if (!header) |
| 390 | break; |
| 391 | header = (struct acpi_table_header *) |
| 392 | boot_ioremap(saved_rsdt.entry[i], header->length); |
| 393 | if (!header) |
| 394 | break; |
| 395 | |
| 396 | if (strncmp((char *) &header->signature, "SRAT", 4)) |
| 397 | continue; |
| 398 | |
| 399 | /* we've found the srat table. don't need to look at any more tables */ |
| 400 | return acpi20_parse_srat((struct acpi_table_srat *)header); |
| 401 | } |
| 402 | out_err: |
| 403 | printk("failed to get NUMA memory information from SRAT table\n"); |
| 404 | return 0; |
| 405 | } |
| 406 | |
| 407 | /* For each node run the memory list to determine whether there are |
| 408 | * any memory holes. For each hole determine which ZONE they fall |
| 409 | * into. |
| 410 | * |
| 411 | * NOTE#1: this requires knowledge of the zone boundries and so |
| 412 | * _cannot_ be performed before those are calculated in setup_memory. |
| 413 | * |
| 414 | * NOTE#2: we rely on the fact that the memory chunks are ordered by |
| 415 | * start pfn number during setup. |
| 416 | */ |
| 417 | static void __init get_zholes_init(void) |
| 418 | { |
| 419 | int nid; |
| 420 | int c; |
| 421 | int first; |
| 422 | unsigned long end = 0; |
| 423 | |
| 424 | for_each_online_node(nid) { |
| 425 | first = 1; |
| 426 | for (c = 0; c < num_memory_chunks; c++){ |
| 427 | if (node_memory_chunk[c].nid == nid) { |
| 428 | if (first) { |
| 429 | end = node_memory_chunk[c].end_pfn; |
| 430 | first = 0; |
| 431 | |
| 432 | } else { |
| 433 | /* Record any gap between this chunk |
| 434 | * and the previous chunk on this node |
| 435 | * against the zones it spans. |
| 436 | */ |
| 437 | chunk_to_zones(end, |
| 438 | node_memory_chunk[c].start_pfn, |
| 439 | &zholes_size[nid * MAX_NR_ZONES]); |
| 440 | } |
| 441 | } |
| 442 | } |
| 443 | } |
| 444 | } |
| 445 | |
| 446 | unsigned long * __init get_zholes_size(int nid) |
| 447 | { |
| 448 | if (!zholes_size_init) { |
| 449 | zholes_size_init++; |
| 450 | get_zholes_init(); |
| 451 | } |
| 452 | if (nid >= MAX_NUMNODES || !node_online(nid)) |
| 453 | printk("%s: nid = %d is invalid/offline. num_online_nodes = %d", |
| 454 | __FUNCTION__, nid, num_online_nodes()); |
| 455 | return &zholes_size[nid * MAX_NR_ZONES]; |
| 456 | } |