// SPDX-License-Identifier: GPL-2.0

#include <linux/slab.h>
#include "ctree.h"
#include "subpage.h"
/*
 * Subpage (sectorsize < PAGE_SIZE) support overview:
 *
 * Limitations:
 *
 * - Only 64K page size is supported for now
 *   This makes metadata handling easier, as a 64K page ensures that every
 *   nodesize fits inside one page, so we don't need to handle cases where a
 *   tree block crosses several pages.
 *
 * - Only metadata read-write for now
 *   The data read-write part is in development.
 *
 * - Metadata can't cross the 64K page boundary
 *   btrfs-progs and the kernel have ensured this for a while, thus only
 *   ancient filesystems could have such a problem.  For such a case, do a
 *   graceful rejection.
 *
 * Special behavior:
 *
 * - Metadata
 *   Metadata read is fully supported.
 *   That is, reading one tree block will only trigger the read for the
 *   needed range; other unrelated ranges in the same page will not be touched.
 *
 *   Metadata write support is partial.
 *   The writeback is still for the full page, but we will only submit
 *   the dirty extent buffers in the page.
 *
 *   This means, if we have a metadata page like this:
 *
 *	Page offset
 *	  0         16K         32K         48K        64K
 *	  |/////////|           |///////////|
 *	     \- Tree block A       \- Tree block B
 *
 *   Even if we just want to writeback tree block A, we will also writeback
 *   tree block B if it's also dirty.
 *
 *   This may cause extra metadata writeback, which results in more COW.
 *
 * Implementation:
 *
 * - Common
 *   Both metadata and data will use a new structure, btrfs_subpage, to
 *   record the status of each sector inside a page.  This provides the extra
 *   granularity needed.
 *
 * - Metadata
 *   Since we have multiple tree blocks inside one page, we can't rely on page
 *   locking anymore, or we would have greatly reduced concurrency or even
 *   deadlocks (holding one tree lock while trying to lock another tree block
 *   in the same page).
 *
 *   Thus for metadata locking, subpage support relies on io_tree locking only.
 *   This means a slightly higher tree locking latency.
 */
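
/*
 * A rough sketch of the per-page structure manipulated below, for reference
 * while reading this file.  This is NOT the authoritative definition (that
 * lives in subpage.h and may contain more members); it only lists the fields
 * this file uses:
 *
 *	struct btrfs_subpage {
 *		spinlock_t lock;	// protects the bitmaps below
 *		atomic_t eb_refs;	// metadata: extent buffers using the page
 *		atomic_t readers;	// data: sectors under read
 *		atomic_t writers;	// data: sectors under write
 *		u16 uptodate_bitmap;	// one bit per sector in the page
 *		u16 error_bitmap;
 *		u16 dirty_bitmap;
 *		u16 writeback_bitmap;
 *	};
 */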

int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
			 struct page *page, enum btrfs_subpage_type type)
{
	struct btrfs_subpage *subpage = NULL;
	int ret;

	/*
	 * We have cases like a dummy extent buffer page, which is not mapped
	 * and doesn't need to be locked.
	 */
	if (page->mapping)
		ASSERT(PageLocked(page));
	/* Either not subpage, or the page already has private attached */
	if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
		return 0;

	ret = btrfs_alloc_subpage(fs_info, &subpage, type);
	if (ret < 0)
		return ret;
	attach_page_private(page, subpage);
	return 0;
}

void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
			  struct page *page)
{
	struct btrfs_subpage *subpage;

	/* Either not subpage, or already detached */
	if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
		return;

	subpage = (struct btrfs_subpage *)detach_page_private(page);
	ASSERT(subpage);
	btrfs_free_subpage(subpage);
}

int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
			struct btrfs_subpage **ret,
			enum btrfs_subpage_type type)
{
	if (fs_info->sectorsize == PAGE_SIZE)
		return 0;

	*ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS);
	if (!*ret)
		return -ENOMEM;
	spin_lock_init(&(*ret)->lock);
	if (type == BTRFS_SUBPAGE_METADATA) {
		atomic_set(&(*ret)->eb_refs, 0);
	} else {
		atomic_set(&(*ret)->readers, 0);
		atomic_set(&(*ret)->writers, 0);
	}
	return 0;
}

void btrfs_free_subpage(struct btrfs_subpage *subpage)
{
	kfree(subpage);
}

/*
 * Increase the eb_refs of current subpage.
 *
 * This is important for eb allocation, to prevent a race with the freeing of
 * the last eb in the same page.
 * With eb_refs increased before the eb is inserted into the radix tree,
 * detach_extent_buffer_page() won't detach the page private while we're still
 * allocating the extent buffer.
 */
void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	atomic_inc(&subpage->eb_refs);
}
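
/*
 * Illustrative pairing only (the real call sites live in extent_io.c): an
 * extent buffer sharing the page is expected to do, under
 * page->mapping->private_lock,
 *
 *	btrfs_page_inc_eb_refs(fs_info, page);	// when the eb starts using the page
 *	...
 *	btrfs_page_dec_eb_refs(fs_info, page);	// when the eb releases the page
 *
 * and only detach the page private once no extent buffer holds a reference.
 * This sketch merely restates the ordering described in the comment above;
 * it is not a verbatim copy of the extent buffer code.
 */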

void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
			    struct page *page)
{
	struct btrfs_subpage *subpage;

	if (fs_info->sectorsize == PAGE_SIZE)
		return;

	ASSERT(PagePrivate(page) && page->mapping);
	lockdep_assert_held(&page->mapping->private_lock);

	subpage = (struct btrfs_subpage *)page->private;
	ASSERT(atomic_read(&subpage->eb_refs));
	atomic_dec(&subpage->eb_refs);
}

static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	/* Basic checks */
	ASSERT(PagePrivate(page) && page->private);
	ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
	       IS_ALIGNED(len, fs_info->sectorsize));
	/*
	 * The range check only works for mapped pages, as we can still have
	 * unmapped pages like dummy extent buffer pages.
	 */
	if (page->mapping)
		ASSERT(page_offset(page) <= start &&
		       start + len <= page_offset(page) + PAGE_SIZE);
}

void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ret = atomic_add_return(nbits, &subpage->readers);
	ASSERT(ret == nbits);
}

void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
			      struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);
	ASSERT(atomic_read(&subpage->readers) >= nbits);
	if (atomic_sub_and_test(nbits, &subpage->readers))
		unlock_page(page);
}

static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
{
	u64 orig_start = *start;
	u32 orig_len = *len;

	*start = max_t(u64, page_offset(page), orig_start);
	*len = min_t(u64, page_offset(page) + PAGE_SIZE,
		     orig_start + orig_len) - *start;
}

void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);
	int ret;

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->readers) == 0);
	ret = atomic_add_return(nbits, &subpage->writers);
	ASSERT(ret == nbits);
}

bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
				       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const int nbits = (len >> fs_info->sectorsize_bits);

	btrfs_subpage_assert(fs_info, page, start, len);

	ASSERT(atomic_read(&subpage->writers) >= nbits);
	return atomic_sub_and_test(nbits, &subpage->writers);
}

/*
 * Lock a page for delalloc page writeback.
 *
 * Return -EAGAIN if the page is not properly initialized.
 * Return 0 with the page locked, and the writer counter updated.
 *
 * Even with 0 returned, the page still needs an extra check to make sure
 * it's really the correct page, as the caller is using
 * find_get_pages_contig(), which can race with page invalidation.
 */
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
		lock_page(page);
		return 0;
	}
	lock_page(page);
	if (!PagePrivate(page) || !page->private) {
		unlock_page(page);
		return -EAGAIN;
	}
	btrfs_subpage_clamp_range(page, &start, &len);
	btrfs_subpage_start_writer(fs_info, page, start, len);
	return 0;
}
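
/*
 * Hypothetical caller sketch (the real delalloc locking loop lives in
 * extent_io.c; this only illustrates why -EAGAIN and the extra check are
 * needed).  Pages found via find_get_pages_contig() may have been invalidated
 * between lookup and locking, so a caller would do roughly:
 *
 *	ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
 *	if (ret == -EAGAIN) {
 *		// page lost its private data, drop it and redo the lookup
 *	} else if (page->mapping != expected_mapping) {
 *		// locked fine, but it's no longer the page we wanted
 *		btrfs_page_end_writer_lock(fs_info, page, start, len);
 *	}
 *
 * The name expected_mapping above is illustrative, not taken from this file.
 */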

void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
		return unlock_page(page);
	btrfs_subpage_clamp_range(page, &start, &len);
	if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
		unlock_page(page);
}

/*
 * Convert the [start, start + len) range into a u16 bitmap
 *
 * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0.
 */
static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info,
				     struct page *page, u64 start, u32 len)
{
	const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits;
	const int nbits = len >> fs_info->sectorsize_bits;

	btrfs_subpage_assert(fs_info, page, start, len);

	/*
	 * Here nbits can be 16, thus can go beyond u16 range.  We make the
	 * first left shift be calculated in unsigned long (at least u32),
	 * then truncate the result to u16.
	 */
	return (u16)(((1UL << nbits) - 1) << bit_start);
}
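
/*
 * Worked instance of the example above, assuming a 4K sectorsize on a 64K
 * page (16 sectors, one bit each):
 *
 *	start = page_offset() + 16K, len = 16K
 *	bit_start = (16K % 64K) >> 12 = 4
 *	nbits     = 16K >> 12        = 4
 *	bitmap    = ((1UL << 4) - 1) << 4 = 0xf << 4 = 0x00f0
 *
 * i.e. bits 4-7 of the u16 cover sectors 4-7 (bytes 16K-32K) of the page.
 */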

void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
				struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap |= tmp;
	if (subpage->uptodate_bitmap == U16_MAX)
		SetPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
				  struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->uptodate_bitmap &= ~tmp;
	ClearPageUptodate(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap |= tmp;
	SetPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->error_bitmap &= ~tmp;
	if (subpage->error_bitmap == 0)
		ClearPageError(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
			     struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap |= tmp;
	spin_unlock_irqrestore(&subpage->lock, flags);
	set_page_dirty(page);
}

/*
 * Extra clear_and_test function for the subpage dirty bitmap.
 *
 * Return true if we cleared the last dirty bits in the dirty_bitmap.
 * Return false otherwise.
 *
 * NOTE: Callers should manually clear the page dirty flag for the true case,
 * as we have extra handling for tree blocks.
 */
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
					struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;
	bool last = false;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->dirty_bitmap &= ~tmp;
	if (subpage->dirty_bitmap == 0)
		last = true;
	spin_unlock_irqrestore(&subpage->lock, flags);
	return last;
}

void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
			       struct page *page, u64 start, u32 len)
{
	bool last;

	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
	if (last)
		clear_page_dirty_for_io(page);
}

void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
				 struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap |= tmp;
	set_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
				   struct page *page, u64 start, u32 len)
{
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
	u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
	unsigned long flags;

	spin_lock_irqsave(&subpage->lock, flags);
	subpage->writeback_bitmap &= ~tmp;
	if (subpage->writeback_bitmap == 0)
		end_page_writeback(page);
	spin_unlock_irqrestore(&subpage->lock, flags);
}

/*
 * Unlike set/clear, which depend on each page's status, for test all bits
 * are tested in the same way.
 */
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name)				\
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
	const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \
	unsigned long flags;						\
	bool ret;							\
									\
	spin_lock_irqsave(&subpage->lock, flags);			\
	ret = ((subpage->name##_bitmap & tmp) == tmp);			\
	spin_unlock_irqrestore(&subpage->lock, flags);			\
	return ret;							\
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
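
/*
 * For readability, IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty) above expands to a
 * helper with this signature, returning true only when every sector in
 * [start, start + len) has its dirty bit set:
 *
 *	bool btrfs_subpage_test_dirty(const struct btrfs_fs_info *fs_info,
 *				      struct page *page, u64 start, u32 len);
 */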

/*
 * Note that, in selftests (extent-io-tests), we can have empty fs_info passed
 * in.  We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
 * back to the regular sectorsize branch.
 */
#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func,	\
				 test_page_func)			\
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}									\
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		set_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_set_##name(fs_info, page, start, len);		\
}									\
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {	\
		clear_page_func(page);					\
		return;							\
	}								\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	btrfs_subpage_clear_##name(fs_info, page, start, len);		\
}									\
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info,	\
		struct page *page, u64 start, u32 len)			\
{									\
	if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)	\
		return test_page_func(page);				\
	btrfs_subpage_clamp_range(page, &start, &len);			\
	return btrfs_subpage_test_##name(fs_info, page, start, len);	\
}
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
			 PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
			 PageDirty);
IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
			 PageWriteback);
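
/*
 * Summary of the helpers generated by the instantiations above (derived
 * mechanically from IMPLEMENT_BTRFS_PAGE_OPS): for each of uptodate, error,
 * dirty and writeback there is
 *
 *	btrfs_page_set_<name>()		// range must already be inside the page
 *	btrfs_page_clear_<name>()
 *	btrfs_page_test_<name>()
 *	btrfs_page_clamp_set_<name>()	// range is first clamped to the page
 *	btrfs_page_clamp_clear_<name>()
 *	btrfs_page_clamp_test_<name>()
 *
 * all of which fall back to the plain page flag helpers when
 * sectorsize == PAGE_SIZE or fs_info is NULL (selftests).
 */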