/*
 * mm/percpu-vm.c - vmalloc area based chunk allocation
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * Chunks are mapped into vmalloc areas and populated page by page.
 * This is the default chunk allocator.
 */
| 12 | |
| 13 | static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, |
| 14 | unsigned int cpu, int page_idx) |
| 15 | { |
| 16 | /* must not be used on pre-mapped chunk */ |
| 17 | WARN_ON(chunk->immutable); |
| 18 | |
| 19 | return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); |
| 20 | } |
| 21 | |
| 22 | /** |
| 23 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap |
| 24 | * @chunk: chunk of interest |
| 25 | * @bitmapp: output parameter for bitmap |
| 26 | * @may_alloc: may allocate the array |
| 27 | * |
| 28 | * Returns pointer to array of pointers to struct page and bitmap, |
| 29 | * both of which can be indexed with pcpu_page_idx(). The returned |
| 30 | * array is cleared to zero and *@bitmapp is copied from |
| 31 | * @chunk->populated. Note that there is only one array and bitmap |
| 32 | * and access exclusion is the caller's responsibility. |
| 33 | * |
| 34 | * CONTEXT: |
| 35 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. |
| 36 | * Otherwise, don't care. |
| 37 | * |
| 38 | * RETURNS: |
| 39 | * Pointer to temp pages array on success, NULL on failure. |
| 40 | */ |
| 41 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, |
| 42 | unsigned long **bitmapp, |
| 43 | bool may_alloc) |
| 44 | { |
| 45 | static struct page **pages; |
| 46 | static unsigned long *bitmap; |
| 47 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); |
| 48 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * |
| 49 | sizeof(unsigned long); |
| 50 | |
| 51 | if (!pages || !bitmap) { |
| 52 | if (may_alloc && !pages) |
Bob Liu | 90459ce0 | 2011-08-04 11:02:33 +0200 | [diff] [blame] | 53 | pages = pcpu_mem_zalloc(pages_size); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 54 | if (may_alloc && !bitmap) |
Bob Liu | 90459ce0 | 2011-08-04 11:02:33 +0200 | [diff] [blame] | 55 | bitmap = pcpu_mem_zalloc(bitmap_size); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 56 | if (!pages || !bitmap) |
| 57 | return NULL; |
| 58 | } |
| 59 | |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 60 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); |
| 61 | |
| 62 | *bitmapp = bitmap; |
| 63 | return pages; |
| 64 | } |
| 65 | |
| 66 | /** |
| 67 | * pcpu_free_pages - free pages which were allocated for @chunk |
| 68 | * @chunk: chunk pages were allocated for |
| 69 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() |
| 70 | * @populated: populated bitmap |
| 71 | * @page_start: page index of the first page to be freed |
| 72 | * @page_end: page index of the last page to be freed + 1 |
| 73 | * |
| 74 | * Free pages [@page_start and @page_end) in @pages for all units. |
| 75 | * The pages were allocated for @chunk. |
| 76 | */ |
| 77 | static void pcpu_free_pages(struct pcpu_chunk *chunk, |
| 78 | struct page **pages, unsigned long *populated, |
| 79 | int page_start, int page_end) |
| 80 | { |
| 81 | unsigned int cpu; |
| 82 | int i; |
| 83 | |
| 84 | for_each_possible_cpu(cpu) { |
| 85 | for (i = page_start; i < page_end; i++) { |
| 86 | struct page *page = pages[pcpu_page_idx(cpu, i)]; |
| 87 | |
| 88 | if (page) |
| 89 | __free_page(page); |
| 90 | } |
| 91 | } |
| 92 | } |
| 93 | |
| 94 | /** |
| 95 | * pcpu_alloc_pages - allocates pages for @chunk |
| 96 | * @chunk: target chunk |
| 97 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() |
| 98 | * @populated: populated bitmap |
| 99 | * @page_start: page index of the first page to be allocated |
| 100 | * @page_end: page index of the last page to be allocated + 1 |
| 101 | * |
| 102 | * Allocate pages [@page_start,@page_end) into @pages for all units. |
| 103 | * The allocation is for @chunk. Percpu core doesn't care about the |
| 104 | * content of @pages and will pass it verbatim to pcpu_map_pages(). |
| 105 | */ |
| 106 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, |
| 107 | struct page **pages, unsigned long *populated, |
| 108 | int page_start, int page_end) |
| 109 | { |
| 110 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; |
| 111 | unsigned int cpu; |
| 112 | int i; |
| 113 | |
| 114 | for_each_possible_cpu(cpu) { |
| 115 | for (i = page_start; i < page_end; i++) { |
| 116 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; |
| 117 | |
| 118 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); |
| 119 | if (!*pagep) { |
| 120 | pcpu_free_pages(chunk, pages, populated, |
| 121 | page_start, page_end); |
| 122 | return -ENOMEM; |
| 123 | } |
| 124 | } |
| 125 | } |
| 126 | return 0; |
| 127 | } |
| 128 | |
| 129 | /** |
| 130 | * pcpu_pre_unmap_flush - flush cache prior to unmapping |
| 131 | * @chunk: chunk the regions to be flushed belongs to |
| 132 | * @page_start: page index of the first page to be flushed |
| 133 | * @page_end: page index of the last page to be flushed + 1 |
| 134 | * |
| 135 | * Pages in [@page_start,@page_end) of @chunk are about to be |
| 136 | * unmapped. Flush cache. As each flushing trial can be very |
| 137 | * expensive, issue flush on the whole region at once rather than |
| 138 | * doing it for each cpu. This could be an overkill but is more |
| 139 | * scalable. |
| 140 | */ |
| 141 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, |
| 142 | int page_start, int page_end) |
| 143 | { |
| 144 | flush_cache_vunmap( |
Tejun Heo | a855b84 | 2011-11-18 10:55:35 -0800 | [diff] [blame^] | 145 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
| 146 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 147 | } |
| 148 | |
| 149 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) |
| 150 | { |
| 151 | unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); |
| 152 | } |
| 153 | |
| 154 | /** |
| 155 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk |
| 156 | * @chunk: chunk of interest |
| 157 | * @pages: pages array which can be used to pass information to free |
| 158 | * @populated: populated bitmap |
| 159 | * @page_start: page index of the first page to unmap |
| 160 | * @page_end: page index of the last page to unmap + 1 |
| 161 | * |
| 162 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. |
| 163 | * Corresponding elements in @pages were cleared by the caller and can |
| 164 | * be used to carry information to pcpu_free_pages() which will be |
| 165 | * called after all unmaps are finished. The caller should call |
| 166 | * proper pre/post flush functions. |
| 167 | */ |
| 168 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, |
| 169 | struct page **pages, unsigned long *populated, |
| 170 | int page_start, int page_end) |
| 171 | { |
| 172 | unsigned int cpu; |
| 173 | int i; |
| 174 | |
| 175 | for_each_possible_cpu(cpu) { |
| 176 | for (i = page_start; i < page_end; i++) { |
| 177 | struct page *page; |
| 178 | |
| 179 | page = pcpu_chunk_page(chunk, cpu, i); |
| 180 | WARN_ON(!page); |
| 181 | pages[pcpu_page_idx(cpu, i)] = page; |
| 182 | } |
| 183 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
| 184 | page_end - page_start); |
| 185 | } |
| 186 | |
| 187 | for (i = page_start; i < page_end; i++) |
| 188 | __clear_bit(i, populated); |
| 189 | } |
| 190 | |
| 191 | /** |
| 192 | * pcpu_post_unmap_tlb_flush - flush TLB after unmapping |
| 193 | * @chunk: pcpu_chunk the regions to be flushed belong to |
| 194 | * @page_start: page index of the first page to be flushed |
| 195 | * @page_end: page index of the last page to be flushed + 1 |
| 196 | * |
| 197 | * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush |
| 198 | * TLB for the regions. This can be skipped if the area is to be |
| 199 | * returned to vmalloc as vmalloc will handle TLB flushing lazily. |
| 200 | * |
| 201 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once |
| 202 | * for the whole region. |
| 203 | */ |
| 204 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, |
| 205 | int page_start, int page_end) |
| 206 | { |
| 207 | flush_tlb_kernel_range( |
Tejun Heo | a855b84 | 2011-11-18 10:55:35 -0800 | [diff] [blame^] | 208 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
| 209 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 210 | } |
| 211 | |
| 212 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, |
| 213 | int nr_pages) |
| 214 | { |
| 215 | return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, |
| 216 | PAGE_KERNEL, pages); |
| 217 | } |
| 218 | |
| 219 | /** |
| 220 | * pcpu_map_pages - map pages into a pcpu_chunk |
| 221 | * @chunk: chunk of interest |
| 222 | * @pages: pages array containing pages to be mapped |
| 223 | * @populated: populated bitmap |
| 224 | * @page_start: page index of the first page to map |
| 225 | * @page_end: page index of the last page to map + 1 |
| 226 | * |
| 227 | * For each cpu, map pages [@page_start,@page_end) into @chunk. The |
| 228 | * caller is responsible for calling pcpu_post_map_flush() after all |
| 229 | * mappings are complete. |
| 230 | * |
| 231 | * This function is responsible for setting corresponding bits in |
| 232 | * @chunk->populated bitmap and whatever is necessary for reverse |
| 233 | * lookup (addr -> chunk). |
| 234 | */ |
| 235 | static int pcpu_map_pages(struct pcpu_chunk *chunk, |
| 236 | struct page **pages, unsigned long *populated, |
| 237 | int page_start, int page_end) |
| 238 | { |
| 239 | unsigned int cpu, tcpu; |
| 240 | int i, err; |
| 241 | |
| 242 | for_each_possible_cpu(cpu) { |
| 243 | err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
| 244 | &pages[pcpu_page_idx(cpu, page_start)], |
| 245 | page_end - page_start); |
| 246 | if (err < 0) |
| 247 | goto err; |
| 248 | } |
| 249 | |
| 250 | /* mapping successful, link chunk and mark populated */ |
| 251 | for (i = page_start; i < page_end; i++) { |
| 252 | for_each_possible_cpu(cpu) |
| 253 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], |
| 254 | chunk); |
| 255 | __set_bit(i, populated); |
| 256 | } |
| 257 | |
| 258 | return 0; |
| 259 | |
| 260 | err: |
| 261 | for_each_possible_cpu(tcpu) { |
| 262 | if (tcpu == cpu) |
| 263 | break; |
| 264 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), |
| 265 | page_end - page_start); |
| 266 | } |
| 267 | return err; |
| 268 | } |
| 269 | |
| 270 | /** |
| 271 | * pcpu_post_map_flush - flush cache after mapping |
| 272 | * @chunk: pcpu_chunk the regions to be flushed belong to |
| 273 | * @page_start: page index of the first page to be flushed |
| 274 | * @page_end: page index of the last page to be flushed + 1 |
| 275 | * |
| 276 | * Pages [@page_start,@page_end) of @chunk have been mapped. Flush |
| 277 | * cache. |
| 278 | * |
| 279 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once |
| 280 | * for the whole region. |
| 281 | */ |
| 282 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, |
| 283 | int page_start, int page_end) |
| 284 | { |
| 285 | flush_cache_vmap( |
Tejun Heo | a855b84 | 2011-11-18 10:55:35 -0800 | [diff] [blame^] | 286 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
| 287 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 288 | } |
| 289 | |
| 290 | /** |
| 291 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk |
| 292 | * @chunk: chunk of interest |
| 293 | * @off: offset to the area to populate |
| 294 | * @size: size of the area to populate in bytes |
| 295 | * |
| 296 | * For each cpu, populate and map pages [@page_start,@page_end) into |
| 297 | * @chunk. The area is cleared on return. |
| 298 | * |
| 299 | * CONTEXT: |
| 300 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. |
| 301 | */ |
| 302 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) |
| 303 | { |
| 304 | int page_start = PFN_DOWN(off); |
| 305 | int page_end = PFN_UP(off + size); |
| 306 | int free_end = page_start, unmap_end = page_start; |
| 307 | struct page **pages; |
| 308 | unsigned long *populated; |
| 309 | unsigned int cpu; |
| 310 | int rs, re, rc; |
| 311 | |
| 312 | /* quick path, check whether all pages are already there */ |
| 313 | rs = page_start; |
| 314 | pcpu_next_pop(chunk, &rs, &re, page_end); |
| 315 | if (rs == page_start && re == page_end) |
| 316 | goto clear; |
| 317 | |
| 318 | /* need to allocate and map pages, this chunk can't be immutable */ |
| 319 | WARN_ON(chunk->immutable); |
| 320 | |
| 321 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); |
| 322 | if (!pages) |
| 323 | return -ENOMEM; |
| 324 | |
| 325 | /* alloc and map */ |
| 326 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { |
| 327 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); |
| 328 | if (rc) |
| 329 | goto err_free; |
| 330 | free_end = re; |
| 331 | } |
| 332 | |
| 333 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { |
| 334 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); |
| 335 | if (rc) |
| 336 | goto err_unmap; |
| 337 | unmap_end = re; |
| 338 | } |
| 339 | pcpu_post_map_flush(chunk, page_start, page_end); |
| 340 | |
| 341 | /* commit new bitmap */ |
| 342 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); |
| 343 | clear: |
| 344 | for_each_possible_cpu(cpu) |
| 345 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); |
| 346 | return 0; |
| 347 | |
| 348 | err_unmap: |
| 349 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); |
| 350 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) |
| 351 | pcpu_unmap_pages(chunk, pages, populated, rs, re); |
| 352 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); |
| 353 | err_free: |
| 354 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) |
| 355 | pcpu_free_pages(chunk, pages, populated, rs, re); |
| 356 | return rc; |
| 357 | } |
| 358 | |
| 359 | /** |
| 360 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk |
| 361 | * @chunk: chunk to depopulate |
| 362 | * @off: offset to the area to depopulate |
| 363 | * @size: size of the area to depopulate in bytes |
| 364 | * @flush: whether to flush cache and tlb or not |
| 365 | * |
| 366 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
| 367 | * from @chunk. If @flush is true, vcache is flushed before unmapping |
| 368 | * and tlb after. |
| 369 | * |
| 370 | * CONTEXT: |
| 371 | * pcpu_alloc_mutex. |
| 372 | */ |
| 373 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) |
| 374 | { |
| 375 | int page_start = PFN_DOWN(off); |
| 376 | int page_end = PFN_UP(off + size); |
| 377 | struct page **pages; |
| 378 | unsigned long *populated; |
| 379 | int rs, re; |
| 380 | |
| 381 | /* quick path, check whether it's empty already */ |
| 382 | rs = page_start; |
| 383 | pcpu_next_unpop(chunk, &rs, &re, page_end); |
| 384 | if (rs == page_start && re == page_end) |
| 385 | return; |
| 386 | |
| 387 | /* immutable chunks can't be depopulated */ |
| 388 | WARN_ON(chunk->immutable); |
| 389 | |
| 390 | /* |
| 391 | * If control reaches here, there must have been at least one |
| 392 | * successful population attempt so the temp pages array must |
| 393 | * be available now. |
| 394 | */ |
| 395 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); |
| 396 | BUG_ON(!pages); |
| 397 | |
| 398 | /* unmap and free */ |
| 399 | pcpu_pre_unmap_flush(chunk, page_start, page_end); |
| 400 | |
| 401 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) |
| 402 | pcpu_unmap_pages(chunk, pages, populated, rs, re); |
| 403 | |
| 404 | /* no need to flush tlb, vmalloc will handle it lazily */ |
| 405 | |
| 406 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) |
| 407 | pcpu_free_pages(chunk, pages, populated, rs, re); |
| 408 | |
| 409 | /* commit new bitmap */ |
| 410 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); |
| 411 | } |
| 412 | |
| 413 | static struct pcpu_chunk *pcpu_create_chunk(void) |
| 414 | { |
| 415 | struct pcpu_chunk *chunk; |
| 416 | struct vm_struct **vms; |
| 417 | |
| 418 | chunk = pcpu_alloc_chunk(); |
| 419 | if (!chunk) |
| 420 | return NULL; |
| 421 | |
| 422 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, |
David Rientjes | ec3f64f | 2011-01-13 15:46:01 -0800 | [diff] [blame] | 423 | pcpu_nr_groups, pcpu_atom_size); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 424 | if (!vms) { |
| 425 | pcpu_free_chunk(chunk); |
| 426 | return NULL; |
| 427 | } |
| 428 | |
| 429 | chunk->data = vms; |
| 430 | chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; |
| 431 | return chunk; |
| 432 | } |
| 433 | |
| 434 | static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) |
| 435 | { |
| 436 | if (chunk && chunk->data) |
| 437 | pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); |
| 438 | pcpu_free_chunk(chunk); |
| 439 | } |
| 440 | |
/* Translate @addr to its struct page via vmalloc_to_page(). */
static struct page *pcpu_addr_to_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	return page;
}
| 445 | |
| 446 | static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) |
| 447 | { |
| 448 | /* no extra restriction */ |
| 449 | return 0; |
| 450 | } |