/*
 * mm/percpu-vm.c - vmalloc area based chunk allocation
 *
 * Copyright (C) 2010		SUSE Linux Products GmbH
 * Copyright (C) 2010		Tejun Heo <tj@kernel.org>
 *
 * This file is released under the GPLv2.
 *
 * Chunks are mapped into vmalloc areas and populated page by page.
 * This is the default chunk allocator.
 */
| 12 | |
| 13 | static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, |
| 14 | unsigned int cpu, int page_idx) |
| 15 | { |
| 16 | /* must not be used on pre-mapped chunk */ |
| 17 | WARN_ON(chunk->immutable); |
| 18 | |
| 19 | return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); |
| 20 | } |
| 21 | |
| 22 | /** |
| 23 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap |
| 24 | * @chunk: chunk of interest |
| 25 | * @bitmapp: output parameter for bitmap |
| 26 | * @may_alloc: may allocate the array |
| 27 | * |
| 28 | * Returns pointer to array of pointers to struct page and bitmap, |
| 29 | * both of which can be indexed with pcpu_page_idx(). The returned |
| 30 | * array is cleared to zero and *@bitmapp is copied from |
| 31 | * @chunk->populated. Note that there is only one array and bitmap |
| 32 | * and access exclusion is the caller's responsibility. |
| 33 | * |
| 34 | * CONTEXT: |
| 35 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. |
| 36 | * Otherwise, don't care. |
| 37 | * |
| 38 | * RETURNS: |
| 39 | * Pointer to temp pages array on success, NULL on failure. |
| 40 | */ |
| 41 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, |
| 42 | unsigned long **bitmapp, |
| 43 | bool may_alloc) |
| 44 | { |
| 45 | static struct page **pages; |
| 46 | static unsigned long *bitmap; |
| 47 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); |
| 48 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * |
| 49 | sizeof(unsigned long); |
| 50 | |
| 51 | if (!pages || !bitmap) { |
| 52 | if (may_alloc && !pages) |
Bob Liu | 90459ce0 | 2011-08-04 11:02:33 +0200 | [diff] [blame] | 53 | pages = pcpu_mem_zalloc(pages_size); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 54 | if (may_alloc && !bitmap) |
Bob Liu | 90459ce0 | 2011-08-04 11:02:33 +0200 | [diff] [blame] | 55 | bitmap = pcpu_mem_zalloc(bitmap_size); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 56 | if (!pages || !bitmap) |
| 57 | return NULL; |
| 58 | } |
| 59 | |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 60 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); |
| 61 | |
| 62 | *bitmapp = bitmap; |
| 63 | return pages; |
| 64 | } |
| 65 | |
| 66 | /** |
| 67 | * pcpu_free_pages - free pages which were allocated for @chunk |
| 68 | * @chunk: chunk pages were allocated for |
| 69 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() |
| 70 | * @populated: populated bitmap |
| 71 | * @page_start: page index of the first page to be freed |
| 72 | * @page_end: page index of the last page to be freed + 1 |
| 73 | * |
| 74 | * Free pages [@page_start and @page_end) in @pages for all units. |
| 75 | * The pages were allocated for @chunk. |
| 76 | */ |
| 77 | static void pcpu_free_pages(struct pcpu_chunk *chunk, |
| 78 | struct page **pages, unsigned long *populated, |
| 79 | int page_start, int page_end) |
| 80 | { |
| 81 | unsigned int cpu; |
| 82 | int i; |
| 83 | |
| 84 | for_each_possible_cpu(cpu) { |
| 85 | for (i = page_start; i < page_end; i++) { |
| 86 | struct page *page = pages[pcpu_page_idx(cpu, i)]; |
| 87 | |
| 88 | if (page) |
| 89 | __free_page(page); |
| 90 | } |
| 91 | } |
| 92 | } |
| 93 | |
| 94 | /** |
| 95 | * pcpu_alloc_pages - allocates pages for @chunk |
| 96 | * @chunk: target chunk |
| 97 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() |
| 98 | * @populated: populated bitmap |
| 99 | * @page_start: page index of the first page to be allocated |
| 100 | * @page_end: page index of the last page to be allocated + 1 |
| 101 | * |
| 102 | * Allocate pages [@page_start,@page_end) into @pages for all units. |
| 103 | * The allocation is for @chunk. Percpu core doesn't care about the |
| 104 | * content of @pages and will pass it verbatim to pcpu_map_pages(). |
| 105 | */ |
| 106 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, |
| 107 | struct page **pages, unsigned long *populated, |
| 108 | int page_start, int page_end) |
| 109 | { |
| 110 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; |
Tejun Heo | f0d2796 | 2014-08-15 16:06:06 -0400 | [diff] [blame] | 111 | unsigned int cpu, tcpu; |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 112 | int i; |
| 113 | |
| 114 | for_each_possible_cpu(cpu) { |
| 115 | for (i = page_start; i < page_end; i++) { |
| 116 | struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; |
| 117 | |
| 118 | *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); |
Tejun Heo | f0d2796 | 2014-08-15 16:06:06 -0400 | [diff] [blame] | 119 | if (!*pagep) |
| 120 | goto err; |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 121 | } |
| 122 | } |
| 123 | return 0; |
Tejun Heo | f0d2796 | 2014-08-15 16:06:06 -0400 | [diff] [blame] | 124 | |
| 125 | err: |
| 126 | while (--i >= page_start) |
| 127 | __free_page(pages[pcpu_page_idx(cpu, i)]); |
| 128 | |
| 129 | for_each_possible_cpu(tcpu) { |
| 130 | if (tcpu == cpu) |
| 131 | break; |
| 132 | for (i = page_start; i < page_end; i++) |
| 133 | __free_page(pages[pcpu_page_idx(tcpu, i)]); |
| 134 | } |
| 135 | return -ENOMEM; |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 136 | } |
| 137 | |
| 138 | /** |
| 139 | * pcpu_pre_unmap_flush - flush cache prior to unmapping |
| 140 | * @chunk: chunk the regions to be flushed belongs to |
| 141 | * @page_start: page index of the first page to be flushed |
| 142 | * @page_end: page index of the last page to be flushed + 1 |
| 143 | * |
| 144 | * Pages in [@page_start,@page_end) of @chunk are about to be |
| 145 | * unmapped. Flush cache. As each flushing trial can be very |
| 146 | * expensive, issue flush on the whole region at once rather than |
| 147 | * doing it for each cpu. This could be an overkill but is more |
| 148 | * scalable. |
| 149 | */ |
| 150 | static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, |
| 151 | int page_start, int page_end) |
| 152 | { |
| 153 | flush_cache_vunmap( |
Tejun Heo | a855b84 | 2011-11-18 10:55:35 -0800 | [diff] [blame] | 154 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
| 155 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 156 | } |
| 157 | |
| 158 | static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) |
| 159 | { |
| 160 | unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); |
| 161 | } |
| 162 | |
| 163 | /** |
| 164 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk |
| 165 | * @chunk: chunk of interest |
| 166 | * @pages: pages array which can be used to pass information to free |
| 167 | * @populated: populated bitmap |
| 168 | * @page_start: page index of the first page to unmap |
| 169 | * @page_end: page index of the last page to unmap + 1 |
| 170 | * |
| 171 | * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. |
| 172 | * Corresponding elements in @pages were cleared by the caller and can |
| 173 | * be used to carry information to pcpu_free_pages() which will be |
| 174 | * called after all unmaps are finished. The caller should call |
| 175 | * proper pre/post flush functions. |
| 176 | */ |
| 177 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, |
| 178 | struct page **pages, unsigned long *populated, |
| 179 | int page_start, int page_end) |
| 180 | { |
| 181 | unsigned int cpu; |
| 182 | int i; |
| 183 | |
| 184 | for_each_possible_cpu(cpu) { |
| 185 | for (i = page_start; i < page_end; i++) { |
| 186 | struct page *page; |
| 187 | |
| 188 | page = pcpu_chunk_page(chunk, cpu, i); |
| 189 | WARN_ON(!page); |
| 190 | pages[pcpu_page_idx(cpu, i)] = page; |
| 191 | } |
| 192 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
| 193 | page_end - page_start); |
| 194 | } |
| 195 | |
Akinobu Mita | 26dd8e0 | 2012-01-21 00:15:23 +0900 | [diff] [blame] | 196 | bitmap_clear(populated, page_start, page_end - page_start); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 197 | } |
| 198 | |
| 199 | /** |
| 200 | * pcpu_post_unmap_tlb_flush - flush TLB after unmapping |
| 201 | * @chunk: pcpu_chunk the regions to be flushed belong to |
| 202 | * @page_start: page index of the first page to be flushed |
| 203 | * @page_end: page index of the last page to be flushed + 1 |
| 204 | * |
| 205 | * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush |
| 206 | * TLB for the regions. This can be skipped if the area is to be |
| 207 | * returned to vmalloc as vmalloc will handle TLB flushing lazily. |
| 208 | * |
| 209 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once |
| 210 | * for the whole region. |
| 211 | */ |
| 212 | static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, |
| 213 | int page_start, int page_end) |
| 214 | { |
| 215 | flush_tlb_kernel_range( |
Tejun Heo | a855b84 | 2011-11-18 10:55:35 -0800 | [diff] [blame] | 216 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
| 217 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 218 | } |
| 219 | |
| 220 | static int __pcpu_map_pages(unsigned long addr, struct page **pages, |
| 221 | int nr_pages) |
| 222 | { |
| 223 | return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, |
| 224 | PAGE_KERNEL, pages); |
| 225 | } |
| 226 | |
| 227 | /** |
| 228 | * pcpu_map_pages - map pages into a pcpu_chunk |
| 229 | * @chunk: chunk of interest |
| 230 | * @pages: pages array containing pages to be mapped |
| 231 | * @populated: populated bitmap |
| 232 | * @page_start: page index of the first page to map |
| 233 | * @page_end: page index of the last page to map + 1 |
| 234 | * |
| 235 | * For each cpu, map pages [@page_start,@page_end) into @chunk. The |
| 236 | * caller is responsible for calling pcpu_post_map_flush() after all |
| 237 | * mappings are complete. |
| 238 | * |
| 239 | * This function is responsible for setting corresponding bits in |
| 240 | * @chunk->populated bitmap and whatever is necessary for reverse |
| 241 | * lookup (addr -> chunk). |
| 242 | */ |
| 243 | static int pcpu_map_pages(struct pcpu_chunk *chunk, |
| 244 | struct page **pages, unsigned long *populated, |
| 245 | int page_start, int page_end) |
| 246 | { |
| 247 | unsigned int cpu, tcpu; |
| 248 | int i, err; |
| 249 | |
| 250 | for_each_possible_cpu(cpu) { |
| 251 | err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
| 252 | &pages[pcpu_page_idx(cpu, page_start)], |
| 253 | page_end - page_start); |
| 254 | if (err < 0) |
| 255 | goto err; |
| 256 | } |
| 257 | |
| 258 | /* mapping successful, link chunk and mark populated */ |
| 259 | for (i = page_start; i < page_end; i++) { |
| 260 | for_each_possible_cpu(cpu) |
| 261 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], |
| 262 | chunk); |
| 263 | __set_bit(i, populated); |
| 264 | } |
| 265 | |
| 266 | return 0; |
| 267 | |
| 268 | err: |
| 269 | for_each_possible_cpu(tcpu) { |
| 270 | if (tcpu == cpu) |
| 271 | break; |
| 272 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), |
| 273 | page_end - page_start); |
| 274 | } |
Tejun Heo | 849f516 | 2014-08-15 16:06:10 -0400 | [diff] [blame^] | 275 | pcpu_post_unmap_tlb_flush(chunk, page_start, page_end); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 276 | return err; |
| 277 | } |
| 278 | |
| 279 | /** |
| 280 | * pcpu_post_map_flush - flush cache after mapping |
| 281 | * @chunk: pcpu_chunk the regions to be flushed belong to |
| 282 | * @page_start: page index of the first page to be flushed |
| 283 | * @page_end: page index of the last page to be flushed + 1 |
| 284 | * |
| 285 | * Pages [@page_start,@page_end) of @chunk have been mapped. Flush |
| 286 | * cache. |
| 287 | * |
| 288 | * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once |
| 289 | * for the whole region. |
| 290 | */ |
| 291 | static void pcpu_post_map_flush(struct pcpu_chunk *chunk, |
| 292 | int page_start, int page_end) |
| 293 | { |
| 294 | flush_cache_vmap( |
Tejun Heo | a855b84 | 2011-11-18 10:55:35 -0800 | [diff] [blame] | 295 | pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start), |
| 296 | pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end)); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 297 | } |
| 298 | |
| 299 | /** |
| 300 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk |
| 301 | * @chunk: chunk of interest |
| 302 | * @off: offset to the area to populate |
| 303 | * @size: size of the area to populate in bytes |
| 304 | * |
| 305 | * For each cpu, populate and map pages [@page_start,@page_end) into |
| 306 | * @chunk. The area is cleared on return. |
| 307 | * |
| 308 | * CONTEXT: |
| 309 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. |
| 310 | */ |
| 311 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) |
| 312 | { |
| 313 | int page_start = PFN_DOWN(off); |
| 314 | int page_end = PFN_UP(off + size); |
| 315 | int free_end = page_start, unmap_end = page_start; |
| 316 | struct page **pages; |
| 317 | unsigned long *populated; |
| 318 | unsigned int cpu; |
| 319 | int rs, re, rc; |
| 320 | |
| 321 | /* quick path, check whether all pages are already there */ |
| 322 | rs = page_start; |
| 323 | pcpu_next_pop(chunk, &rs, &re, page_end); |
| 324 | if (rs == page_start && re == page_end) |
| 325 | goto clear; |
| 326 | |
| 327 | /* need to allocate and map pages, this chunk can't be immutable */ |
| 328 | WARN_ON(chunk->immutable); |
| 329 | |
| 330 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); |
| 331 | if (!pages) |
| 332 | return -ENOMEM; |
| 333 | |
| 334 | /* alloc and map */ |
| 335 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { |
| 336 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); |
| 337 | if (rc) |
| 338 | goto err_free; |
| 339 | free_end = re; |
| 340 | } |
| 341 | |
| 342 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { |
| 343 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); |
| 344 | if (rc) |
| 345 | goto err_unmap; |
| 346 | unmap_end = re; |
| 347 | } |
| 348 | pcpu_post_map_flush(chunk, page_start, page_end); |
| 349 | |
| 350 | /* commit new bitmap */ |
| 351 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); |
| 352 | clear: |
| 353 | for_each_possible_cpu(cpu) |
| 354 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); |
| 355 | return 0; |
| 356 | |
| 357 | err_unmap: |
| 358 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); |
| 359 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) |
| 360 | pcpu_unmap_pages(chunk, pages, populated, rs, re); |
| 361 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); |
| 362 | err_free: |
| 363 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) |
| 364 | pcpu_free_pages(chunk, pages, populated, rs, re); |
| 365 | return rc; |
| 366 | } |
| 367 | |
| 368 | /** |
| 369 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk |
| 370 | * @chunk: chunk to depopulate |
| 371 | * @off: offset to the area to depopulate |
| 372 | * @size: size of the area to depopulate in bytes |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 373 | * |
| 374 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
| 375 | * from @chunk. If @flush is true, vcache is flushed before unmapping |
| 376 | * and tlb after. |
| 377 | * |
| 378 | * CONTEXT: |
| 379 | * pcpu_alloc_mutex. |
| 380 | */ |
| 381 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) |
| 382 | { |
| 383 | int page_start = PFN_DOWN(off); |
| 384 | int page_end = PFN_UP(off + size); |
| 385 | struct page **pages; |
| 386 | unsigned long *populated; |
| 387 | int rs, re; |
| 388 | |
| 389 | /* quick path, check whether it's empty already */ |
| 390 | rs = page_start; |
| 391 | pcpu_next_unpop(chunk, &rs, &re, page_end); |
| 392 | if (rs == page_start && re == page_end) |
| 393 | return; |
| 394 | |
| 395 | /* immutable chunks can't be depopulated */ |
| 396 | WARN_ON(chunk->immutable); |
| 397 | |
| 398 | /* |
| 399 | * If control reaches here, there must have been at least one |
| 400 | * successful population attempt so the temp pages array must |
| 401 | * be available now. |
| 402 | */ |
| 403 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); |
| 404 | BUG_ON(!pages); |
| 405 | |
| 406 | /* unmap and free */ |
| 407 | pcpu_pre_unmap_flush(chunk, page_start, page_end); |
| 408 | |
| 409 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) |
| 410 | pcpu_unmap_pages(chunk, pages, populated, rs, re); |
| 411 | |
| 412 | /* no need to flush tlb, vmalloc will handle it lazily */ |
| 413 | |
| 414 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) |
| 415 | pcpu_free_pages(chunk, pages, populated, rs, re); |
| 416 | |
| 417 | /* commit new bitmap */ |
| 418 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); |
| 419 | } |
| 420 | |
| 421 | static struct pcpu_chunk *pcpu_create_chunk(void) |
| 422 | { |
| 423 | struct pcpu_chunk *chunk; |
| 424 | struct vm_struct **vms; |
| 425 | |
| 426 | chunk = pcpu_alloc_chunk(); |
| 427 | if (!chunk) |
| 428 | return NULL; |
| 429 | |
| 430 | vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, |
David Rientjes | ec3f64f | 2011-01-13 15:46:01 -0800 | [diff] [blame] | 431 | pcpu_nr_groups, pcpu_atom_size); |
Tejun Heo | 9f64553 | 2010-04-09 18:57:01 +0900 | [diff] [blame] | 432 | if (!vms) { |
| 433 | pcpu_free_chunk(chunk); |
| 434 | return NULL; |
| 435 | } |
| 436 | |
| 437 | chunk->data = vms; |
| 438 | chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; |
| 439 | return chunk; |
| 440 | } |
| 441 | |
| 442 | static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) |
| 443 | { |
| 444 | if (chunk && chunk->data) |
| 445 | pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); |
| 446 | pcpu_free_chunk(chunk); |
| 447 | } |
| 448 | |
/*
 * Translate @addr to its struct page using vmalloc_to_page().
 */
static struct page *pcpu_addr_to_page(void *addr)
{
	struct page *page = vmalloc_to_page(addr);

	return page;
}
| 453 | |
| 454 | static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) |
| 455 | { |
| 456 | /* no extra restriction */ |
| 457 | return 0; |
| 458 | } |