// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "math.h"

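/*
 * Sum of the space currently consumed in @s_info: bytes_used, bytes_reserved,
 * bytes_pinned and bytes_readonly, plus bytes_may_use when @may_use_included
 * is true.
 */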
u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
                          bool may_use_included)
{
        ASSERT(s_info);
        return s_info->bytes_used + s_info->bytes_reserved +
                s_info->bytes_pinned + s_info->bytes_readonly +
                (may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

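/*
 * Name used for the sysfs directory of a space_info with the given block
 * group type flags.
 */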
static const char *alloc_name(u64 flags)
{
        switch (flags) {
        case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
                return "mixed";
        case BTRFS_BLOCK_GROUP_METADATA:
                return "metadata";
        case BTRFS_BLOCK_GROUP_DATA:
                return "data";
        case BTRFS_BLOCK_GROUP_SYSTEM:
                return "system";
        default:
                WARN_ON(1);
                return "invalid-combination";
        }
}

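/*
 * Allocate and initialize the space_info for the block group type in @flags,
 * register its sysfs kobject and add it to info->space_info. The data
 * space_info is additionally cached in info->data_sinfo.
 */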
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
        struct btrfs_space_info *space_info;
        int i;
        int ret;

        space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
        if (!space_info)
                return -ENOMEM;

        ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
                                  GFP_KERNEL);
        if (ret) {
                kfree(space_info);
                return ret;
        }

        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                INIT_LIST_HEAD(&space_info->block_groups[i]);
        init_rwsem(&space_info->groups_sem);
        spin_lock_init(&space_info->lock);
        space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
        init_waitqueue_head(&space_info->wait);
        INIT_LIST_HEAD(&space_info->ro_bgs);
        INIT_LIST_HEAD(&space_info->tickets);
        INIT_LIST_HEAD(&space_info->priority_tickets);

        ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
                                   info->space_info_kobj, "%s",
                                   alloc_name(space_info->flags));
        if (ret) {
                kobject_put(&space_info->kobj);
                return ret;
        }

        list_add_rcu(&space_info->list, &info->space_info);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                info->data_sinfo = space_info;

        return ret;
}

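/*
 * Create the space_infos needed at mount time: system, plus either a single
 * mixed metadata+data space_info or separate metadata and data ones,
 * depending on the MIXED_GROUPS incompat feature.
 */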
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *disk_super;
        u64 features;
        u64 flags;
        int mixed = 0;
        int ret;

        disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                return -EINVAL;

        features = btrfs_super_incompat_flags(disk_super);
        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        flags = BTRFS_BLOCK_GROUP_SYSTEM;
        ret = create_space_info(fs_info, flags);
        if (ret)
                goto out;

        if (mixed) {
                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        } else {
                flags = BTRFS_BLOCK_GROUP_METADATA;
                ret = create_space_info(fs_info, flags);
                if (ret)
                        goto out;

                flags = BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        }
out:
        return ret;
}

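/*
 * Add @total_bytes, @bytes_used and @bytes_readonly to the space_info
 * matching @flags (disk totals are scaled by the raid factor), hand any newly
 * available space to waiting tickets via btrfs_space_info_add_new_bytes(),
 * and return the space_info through @space_info.
 */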
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
                             u64 total_bytes, u64 bytes_used,
                             u64 bytes_readonly,
                             struct btrfs_space_info **space_info)
{
        struct btrfs_space_info *found;
        int factor;

        factor = btrfs_bg_type_to_factor(flags);

        found = btrfs_find_space_info(info, flags);
        ASSERT(found);
        spin_lock(&found->lock);
        found->total_bytes += total_bytes;
        found->disk_total += total_bytes * factor;
        found->bytes_used += bytes_used;
        found->disk_used += bytes_used * factor;
        found->bytes_readonly += bytes_readonly;
        if (total_bytes > 0)
                found->full = 0;
        btrfs_space_info_add_new_bytes(info, found,
                                       total_bytes - bytes_used -
                                       bytes_readonly);
        spin_unlock(&found->lock);
        *space_info = found;
}

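/*
 * Find the space_info whose type bits match @flags, or NULL if there is none.
 */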
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
                                               u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

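/*
 * Headroom kept for the global block reserve when deciding whether we can
 * overcommit: twice its current size.
 */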
static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
        return (global->size << 1);
}

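/*
 * Decide whether a reservation of @bytes may push the space_info past its
 * total_bytes. Overcommit is only allowed if enough unallocated device space
 * remains (scaled by the raid profile) for a later chunk allocation to cover
 * it; how much of that space we are willing to promise depends on @flush.
 */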
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
                         struct btrfs_space_info *space_info, u64 bytes,
                         enum btrfs_reserve_flush_enum flush,
                         bool system_chunk)
{
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 profile;
        u64 space_size;
        u64 avail;
        u64 used;
        int factor;

        /* Don't overcommit when in mixed mode. */
        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
                return 0;

        if (system_chunk)
                profile = btrfs_system_alloc_profile(fs_info);
        else
                profile = btrfs_metadata_alloc_profile(fs_info);

        used = btrfs_space_info_used(space_info, false);

        /*
         * We only want to allow overcommitting if we have lots of actual space
         * free, but if we don't have enough space to handle the global reserve
         * space then we could end up having a real enospc problem when trying
         * to allocate a chunk or some other such important allocation.
         */
        spin_lock(&global_rsv->lock);
        space_size = calc_global_rsv_need_space(global_rsv);
        spin_unlock(&global_rsv->lock);
        if (used + space_size >= space_info->total_bytes)
                return 0;

        used += space_info->bytes_may_use;

        avail = atomic64_read(&fs_info->free_chunk_space);

        /*
         * If we have dup, raid1 or raid10 then only half of the free
         * space is actually usable. For raid56, the space info used
         * doesn't include the parity drive, so we don't have to
         * change the math.
         */
        factor = btrfs_bg_type_to_factor(profile);
        avail = div_u64(avail, factor);

        /*
         * If we aren't flushing all things, let us overcommit up to
         * half of the space. If we can flush, don't let us overcommit
         * too much, let it overcommit up to 1/8 of the space.
         */
        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;

        if (used + bytes < space_info->total_bytes + avail)
                return 1;
        return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsvs.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 num_bytes)
{
        struct reserve_ticket *ticket;
        struct list_head *head;
        u64 used;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
        bool check_overcommit = false;

        spin_lock(&space_info->lock);
        head = &space_info->priority_tickets;

        /*
         * If we are over our limit then we need to check and see if we can
         * overcommit, and if we can't then we just need to free up our space
         * and not satisfy any requests.
         */
        used = btrfs_space_info_used(space_info, true);
        if (used - num_bytes >= space_info->total_bytes)
                check_overcommit = true;
again:
        while (!list_empty(head) && num_bytes) {
                ticket = list_first_entry(head, struct reserve_ticket,
                                          list);
                /*
                 * We use 0 bytes because this space is already reserved, so
                 * adding the ticket space would be a double count.
                 */
                if (check_overcommit &&
                    !btrfs_can_overcommit(fs_info, space_info, 0, flush,
                                          false))
                        break;
                if (num_bytes >= ticket->bytes) {
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
        }

        if (num_bytes && head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                flush = BTRFS_RESERVE_FLUSH_ALL;
                goto again;
        }
        btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
        trace_btrfs_space_reservation(fs_info, "space_info",
                                      space_info->flags, num_bytes, 0);
        spin_unlock(&space_info->lock);
}

/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 num_bytes)
{
        struct reserve_ticket *ticket;
        struct list_head *head = &space_info->priority_tickets;

again:
        while (!list_empty(head) && num_bytes) {
                ticket = list_first_entry(head, struct reserve_ticket,
                                          list);
                if (num_bytes >= ticket->bytes) {
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      ticket->bytes, 1);
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              ticket->bytes);
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      num_bytes, 1);
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              num_bytes);
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
        }

        if (num_bytes && head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                goto again;
        }
}

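/* Log the current size and reserved bytes of the named block reserve. */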
#define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
do {                                                                    \
        struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;          \
        spin_lock(&__rsv->lock);                                        \
        btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",     \
                   __rsv->size, __rsv->reserved);                      \
        spin_unlock(&__rsv->lock);                                      \
} while (0)

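/*
 * Dump the counters of @info and, when @dump_block_groups is set, the usage
 * of every block group in it. Used for ENOSPC debugging.
 */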
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                           struct btrfs_space_info *info, u64 bytes,
                           int dump_block_groups)
{
        struct btrfs_block_group_cache *cache;
        int index = 0;

        spin_lock(&info->lock);
        btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
                   info->flags,
                   info->total_bytes - btrfs_space_info_used(info, true),
                   info->full ? "" : "not ");
        btrfs_info(fs_info,
                "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
                info->total_bytes, info->bytes_used, info->bytes_pinned,
                info->bytes_reserved, info->bytes_may_use,
                info->bytes_readonly);
        spin_unlock(&info->lock);

        DUMP_BLOCK_RSV(fs_info, global_block_rsv);
        DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
        DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

        if (!dump_block_groups)
                return;

        down_read(&info->groups_sem);
again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
                btrfs_info(fs_info,
                        "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
                        cache->key.objectid, cache->key.offset,
                        btrfs_block_group_used(&cache->item), cache->pinned,
                        cache->reserved, cache->ro ? "[readonly]" : "");
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
        if (++index < BTRFS_NR_RAID_TYPES)
                goto again;
        up_read(&info->groups_sem);
}

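/*
 * Kick writeback of dirty (delalloc) inodes so that their reserved metadata
 * can be freed. Falls back to btrfs_start_delalloc_roots() when the
 * superblock's s_umount cannot be taken.
 */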
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
                                         unsigned long nr_pages, int nr_items)
{
        struct super_block *sb = fs_info->sb;

        if (down_read_trylock(&sb->s_umount)) {
                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
                up_read(&sb->s_umount);
        } else {
                /*
                 * We needn't worry about the filesystem going from r/w to r/o
                 * even though we don't acquire ->s_umount, because the
                 * filesystem guarantees that the delalloc inode list is empty
                 * once it is read-only (all dirty pages have been written to
                 * disk).
                 */
                btrfs_start_delalloc_roots(fs_info, nr_items);
                if (!current->journal_info)
                        btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
        }
}

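/*
 * Convert a byte count into the number of metadata items to reclaim, using
 * btrfs_calc_trans_metadata_size() for a single item as the per-item cost;
 * always at least one.
 */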
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
                                        u64 to_reclaim)
{
        u64 bytes;
        u64 nr;

        bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
        nr = div64_u64(to_reclaim, bytes);
        if (!nr)
                nr = 1;
        return nr;
}

#define EXTENT_SIZE_PER_ITEM    SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                            u64 orig, bool wait_ordered)
{
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
        u64 dio_bytes;
        u64 async_pages;
        u64 items;
        long time_left;
        unsigned long nr_pages;
        int loops;

        /* Calculate the number of pages we need to flush for this reservation */
        items = calc_reclaim_items_nr(fs_info, to_reclaim);
        to_reclaim = items * EXTENT_SIZE_PER_ITEM;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
        dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        if (delalloc_bytes == 0 && dio_bytes == 0) {
                if (trans)
                        return;
                if (wait_ordered)
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                return;
        }

        /*
         * If we are doing more ordered than delalloc we need to just wait on
         * ordered extents, otherwise we'll waste time trying to flush delalloc
         * that likely won't give us the space back we need.
         */
        if (dio_bytes > delalloc_bytes)
                wait_ordered = true;

        loops = 0;
        while ((delalloc_bytes || dio_bytes) && loops < 3) {
                nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

                /*
                 * Triggers inode writeback for up to nr_pages. This will invoke
                 * ->writepages callback and trigger delalloc filling
                 * (btrfs_run_delalloc_range()).
                 */
                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

                /*
                 * We need to wait for the compressed pages to start before
                 * we continue.
                 */
                async_pages = atomic_read(&fs_info->async_delalloc_pages);
                if (!async_pages)
                        goto skip_async;

                /*
                 * Calculate how many compressed pages we want to be written
                 * before we continue. I.e. if there are more async pages than
                 * we require, wait_event will wait until nr_pages are written.
                 */
                if (async_pages <= nr_pages)
                        async_pages = 0;
                else
                        async_pages -= nr_pages;

                wait_event(fs_info->async_submit_wait,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           (int)async_pages);
skip_async:
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
                spin_unlock(&space_info->lock);

                loops++;
                if (wait_ordered && !trans) {
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
                delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
                dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        }
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the fs_info for our filesystem
 * @space_info - the space_info we are trying to reserve from
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
                                  struct btrfs_space_info *space_info)
{
        struct reserve_ticket *ticket = NULL;
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_trans_handle *trans;
        u64 bytes_needed;
        u64 reclaim_bytes = 0;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        if (trans)
                return -EAGAIN;

        spin_lock(&space_info->lock);
        if (!list_empty(&space_info->priority_tickets))
                ticket = list_first_entry(&space_info->priority_tickets,
                                          struct reserve_ticket, list);
        else if (!list_empty(&space_info->tickets))
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);
        bytes_needed = (ticket) ? ticket->bytes : 0;
        spin_unlock(&space_info->lock);

        if (!bytes_needed)
                return 0;

        trans = btrfs_join_transaction(fs_info->extent_root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        /*
         * See if there is enough pinned space to make this reservation, or if
         * we have block groups that are going to be freed, allowing us to
         * possibly do a chunk allocation the next loop through.
         */
        if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
            __percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
                goto commit;

        /*
         * See if there is some space in the delayed insertion reservation for
         * this reservation.
         */
        if (space_info != delayed_rsv->space_info)
                goto enospc;

        spin_lock(&delayed_rsv->lock);
        reclaim_bytes += delayed_rsv->reserved;
        spin_unlock(&delayed_rsv->lock);

        spin_lock(&delayed_refs_rsv->lock);
        reclaim_bytes += delayed_refs_rsv->reserved;
        spin_unlock(&delayed_refs_rsv->lock);
        if (reclaim_bytes >= bytes_needed)
                goto commit;
        bytes_needed -= reclaim_bytes;

        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
                goto enospc;

commit:
        return btrfs_commit_transaction(trans);
enospc:
        btrfs_end_transaction(trans);
        return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
                        struct btrfs_space_info *space_info, u64 num_bytes,
                        int state)
{
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_trans_handle *trans;
        int nr;
        int ret = 0;

        switch (state) {
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
                if (state == FLUSH_DELAYED_ITEMS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
                else
                        nr = -1;

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
        case FLUSH_DELALLOC_WAIT:
                shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
                                state == FLUSH_DELALLOC_WAIT);
                break;
        case FLUSH_DELAYED_REFS_NR:
        case FLUSH_DELAYED_REFS:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                if (state == FLUSH_DELAYED_REFS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes);
                else
                        nr = 0;
                btrfs_run_delayed_refs(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case ALLOC_CHUNK:
        case ALLOC_CHUNK_FORCE:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_chunk_alloc(trans,
                                btrfs_metadata_alloc_profile(fs_info),
                                (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
                                        CHUNK_ALLOC_FORCE);
                btrfs_end_transaction(trans);
                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
                break;
        case COMMIT_TRANS:
                /*
                 * If we have pending delayed iputs then we could free up a
                 * bunch of pinned space, so make sure we run the iputs before
                 * we do our pinned bytes check below.
                 */
                btrfs_run_delayed_iputs(fs_info);
                btrfs_wait_on_delayed_iputs(fs_info);

                ret = may_commit_transaction(fs_info, space_info);
                break;
        default:
                ret = -ENOSPC;
                break;
        }

        trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
                                ret);
        return;
}

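/*
 * How many bytes the flushers should try to reclaim: the sum of all
 * outstanding ticket bytes if there are any, otherwise a heuristic based on
 * how close bytes_may_use and bytes_reserved have pushed this space_info to
 * full.
 */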
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info,
                                 bool system_chunk)
{
        struct reserve_ticket *ticket;
        u64 used;
        u64 expected;
        u64 to_reclaim = 0;

        list_for_each_entry(ticket, &space_info->tickets, list)
                to_reclaim += ticket->bytes;
        list_for_each_entry(ticket, &space_info->priority_tickets, list)
                to_reclaim += ticket->bytes;
        if (to_reclaim)
                return to_reclaim;

        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
        if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
                                 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
                return 0;

        used = btrfs_space_info_used(space_info, true);

        if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
                                 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
                expected = div_factor_fine(space_info->total_bytes, 95);
        else
                expected = div_factor_fine(space_info->total_bytes, 90);

        if (used > expected)
                to_reclaim = used - expected;
        else
                to_reclaim = 0;
        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
                                     space_info->bytes_reserved);
        return to_reclaim;
}

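/*
 * Return nonzero if background reclaim should be kicked off for this
 * space_info: it is close to full of outstanding reservations but not simply
 * full of allocated data, and the filesystem is neither closing nor being
 * remounted.
 */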
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
                                        struct btrfs_space_info *space_info,
                                        u64 used, bool system_chunk)
{
        u64 thresh = div_factor_fine(space_info->total_bytes, 98);

        /* If we're just plain full then async reclaim just slows us down. */
        if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
                return 0;

        if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                              system_chunk))
                return 0;

        return (used >= thresh && !btrfs_fs_closing(fs_info) &&
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

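/*
 * Fail every ticket on @head with -ENOSPC and wake its waiter. Returns true
 * if any ticket had already been partially filled, in which case the caller
 * restarts the flush state machine instead of giving up.
 */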
static bool wake_all_tickets(struct list_head *head)
{
        struct reserve_ticket *ticket;

        while (!list_empty(head)) {
                ticket = list_first_entry(head, struct reserve_ticket, list);
                list_del_init(&ticket->list);
                ticket->error = -ENOSPC;
                wake_up(&ticket->wait);
                if (ticket->bytes != ticket->orig_bytes)
                        return true;
        }
        return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        u64 to_reclaim;
        int flush_state;
        int commit_cycles = 0;
        u64 last_tickets_id;

        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                                      false);
        if (!to_reclaim) {
                space_info->flush = 0;
                spin_unlock(&space_info->lock);
                return;
        }
        last_tickets_id = space_info->tickets_id;
        spin_unlock(&space_info->lock);

        flush_state = FLUSH_DELAYED_ITEMS_NR;
        do {
                flush_space(fs_info, space_info, to_reclaim, flush_state);
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets)) {
                        space_info->flush = 0;
                        spin_unlock(&space_info->lock);
                        return;
                }
                to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
                                                              space_info,
                                                              false);
                if (last_tickets_id == space_info->tickets_id) {
                        flush_state++;
                } else {
                        last_tickets_id = space_info->tickets_id;
                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                        if (commit_cycles)
                                commit_cycles--;
                }

                /*
                 * We don't want to force a chunk allocation until we've tried
                 * pretty hard to reclaim space. Think of the case where we
                 * freed up a bunch of space and so have a lot of pinned space
                 * to reclaim. We would rather use that than possibly create an
                 * underutilized metadata chunk. So if this is our first run
                 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
                 * commit the transaction. If nothing has changed the next go
                 * around then we can force a chunk allocation.
                 */
                if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
                        flush_state++;

                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                if (wake_all_tickets(&space_info->tickets)) {
                                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                                        commit_cycles--;
                                } else {
                                        space_info->flush = 0;
                                }
                        } else {
                                flush_state = FLUSH_DELAYED_ITEMS_NR;
                        }
                }
                spin_unlock(&space_info->lock);
        } while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        ALLOC_CHUNK,
};

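/*
 * Synchronous flushing for priority tickets: run each state in
 * priority_flush_states once, stopping as soon as @ticket has been satisfied.
 */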
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                                            struct btrfs_space_info *space_info,
                                            struct reserve_ticket *ticket)
{
        u64 to_reclaim;
        int flush_state;

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                                      false);
        if (!to_reclaim) {
                spin_unlock(&space_info->lock);
                return;
        }
        spin_unlock(&space_info->lock);

        flush_state = 0;
        do {
                flush_space(fs_info, space_info, to_reclaim,
                            priority_flush_states[flush_state]);
                flush_state++;
                spin_lock(&space_info->lock);
                if (ticket->bytes == 0) {
                        spin_unlock(&space_info->lock);
                        return;
                }
                spin_unlock(&space_info->lock);
        } while (flush_state < ARRAY_SIZE(priority_flush_states));
}

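/*
 * Sleep until @ticket is satisfied by the async flusher, fails, or we are
 * killed. Any partial reservation left on the ticket is returned to the
 * space_info via btrfs_space_info_add_old_bytes().
 */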
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                               struct btrfs_space_info *space_info,
                               struct reserve_ticket *ticket)
{
        DEFINE_WAIT(wait);
        u64 reclaim_bytes = 0;
        int ret = 0;

        spin_lock(&space_info->lock);
        while (ticket->bytes > 0 && ticket->error == 0) {
                ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
                if (ret) {
                        ret = -EINTR;
                        break;
                }
                spin_unlock(&space_info->lock);

                schedule();

                finish_wait(&ticket->wait, &wait);
                spin_lock(&space_info->lock);
        }
        if (!ret)
                ret = ticket->error;
        if (!list_empty(&ticket->list))
                list_del_init(&ticket->list);
        if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
                reclaim_bytes = ticket->orig_bytes - ticket->bytes;
        spin_unlock(&space_info->lock);

        if (reclaim_bytes)
                btrfs_space_info_add_old_bytes(fs_info, space_info,
                                               reclaim_bytes);
        return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from a space_info
 * @fs_info - the filesystem we're allocating for
 * @space_info - the space_info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether this reservation is for the chunk root
 *
 * This will reserve orig_bytes number of bytes from the given space_info. If
 * there is not enough space it will make an attempt to flush out space to
 * make room. It will do this by flushing delalloc if possible or committing
 * the transaction. If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 orig_bytes,
                                    enum btrfs_reserve_flush_enum flush,
                                    bool system_chunk)
{
        struct reserve_ticket ticket;
        u64 used;
        u64 reclaim_bytes = 0;
        int ret = 0;

        ASSERT(orig_bytes);
        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

        spin_lock(&space_info->lock);
        ret = -ENOSPC;
        used = btrfs_space_info_used(space_info, true);

        /*
         * If we have enough space then hooray, make our reservation and carry
         * on. If not see if we can overcommit, and if we can, hooray carry on.
         * If not, things get more complicated.
         */
        if (used + orig_bytes <= space_info->total_bytes) {
                btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                      orig_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, orig_bytes, 1);
                ret = 0;
        } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush,
                                        system_chunk)) {
                btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                      orig_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, orig_bytes, 1);
                ret = 0;
        }

        /*
         * If we couldn't make a reservation then set up our reservation
         * ticket and kick the async worker if it's not already running.
         *
         * If we are a priority flusher then we just need to add our ticket to
         * the list and we will do our own flushing further down.
         */
        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                ticket.orig_bytes = orig_bytes;
                ticket.bytes = orig_bytes;
                ticket.error = 0;
                init_waitqueue_head(&ticket.wait);
                if (flush == BTRFS_RESERVE_FLUSH_ALL) {
                        list_add_tail(&ticket.list, &space_info->tickets);
                        if (!space_info->flush) {
                                space_info->flush = 1;
                                trace_btrfs_trigger_flush(fs_info,
                                                          space_info->flags,
                                                          orig_bytes, flush,
                                                          "enospc");
                                queue_work(system_unbound_wq,
                                           &fs_info->async_reclaim_work);
                        }
                } else {
                        list_add_tail(&ticket.list,
                                      &space_info->priority_tickets);
                }
        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
                used += orig_bytes;
                /*
                 * We will do the space reservation dance during log replay,
                 * which means we won't have fs_info->fs_root set, so don't do
                 * the async reclaim as we will panic.
                 */
                if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
                    need_do_async_reclaim(fs_info, space_info,
                                          used, system_chunk) &&
                    !work_busy(&fs_info->async_reclaim_work)) {
                        trace_btrfs_trigger_flush(fs_info, space_info->flags,
                                                  orig_bytes, flush, "preempt");
                        queue_work(system_unbound_wq,
                                   &fs_info->async_reclaim_work);
                }
        }
        spin_unlock(&space_info->lock);
        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                return ret;

        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                return wait_reserve_ticket(fs_info, space_info, &ticket);

        ret = 0;
        priority_reclaim_metadata_space(fs_info, space_info, &ticket);
        spin_lock(&space_info->lock);
        if (ticket.bytes) {
                if (ticket.bytes < orig_bytes)
                        reclaim_bytes = orig_bytes - ticket.bytes;
                list_del_init(&ticket.list);
                ret = -ENOSPC;
        }
        spin_unlock(&space_info->lock);

        if (reclaim_bytes)
                btrfs_space_info_add_old_bytes(fs_info, space_info,
                                               reclaim_bytes);
        ASSERT(list_empty(&ticket.list));
        return ret;
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
                                 struct btrfs_block_rsv *block_rsv,
                                 u64 orig_bytes,
                                 enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        int ret;
        bool system_chunk = (root == fs_info->chunk_root);

        ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
                                       orig_bytes, flush, system_chunk);
        if (ret == -ENOSPC &&
            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
                if (block_rsv != global_rsv &&
                    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
        if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);

                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_dump_space_info(fs_info, block_rsv->space_info,
                                              orig_bytes, 0);
        }
        return ret;
}