// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "math.h"
#include "block-group.h"

u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

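/*
 * Allocate and initialize the in-memory space_info for the block group type
 * given in @flags, add it to fs_info->space_info and register it in sysfs.
 * The DATA space_info is additionally cached in fs_info->data_sinfo.
 */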
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

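/*
 * Create the space_infos needed at mount time: SYSTEM always, plus either a
 * single mixed METADATA|DATA space_info or separate METADATA and DATA ones,
 * depending on the MIXED_GROUPS incompat flag in the super block.
 */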
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

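/*
 * Account a newly added block group: bump the logical totals as well as the
 * on-disk totals (scaled by the raid factor of @flags), clear the full flag,
 * and hand the newly usable bytes to any waiting tickets via
 * btrfs_space_info_add_new_bytes().
 */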
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_space_info_add_new_bytes(info, found,
				       total_bytes - bytes_used -
				       bytes_readonly);
	spin_unlock(&found->lock);
	*space_info = found;
}

struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

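/*
 * Decide whether a metadata (or system) reservation of @bytes may overcommit
 * the space currently backed by allocated chunks.  Roughly:
 *
 *	limit = total_bytes + (free_chunk_space / raid_factor) / N
 *
 * where N is 8 for BTRFS_RESERVE_FLUSH_ALL and 2 otherwise, and the answer is
 * yes only while used + bytes stays below that limit.  Data space_infos and
 * reservations that would eat into the global reserve never overcommit.
 */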
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 profile;
	u64 space_size;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, false);

	/*
	 * We only want to allow over committing if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
		return 0;

	used += space_info->bytes_may_use;

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable.  For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * 1/2 of the space.  If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head;
	u64 used;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
	bool check_overcommit = false;

	spin_lock(&space_info->lock);
	head = &space_info->priority_tickets;

	/*
	 * If we are over our limit then we need to check and see if we can
	 * overcommit, and if we can't then we just need to free up our space
	 * and not satisfy any requests.
	 */
	used = btrfs_space_info_used(space_info, true);
	if (used - num_bytes >= space_info->total_bytes)
		check_overcommit = true;
again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		/*
		 * We use 0 bytes because this space is already reserved, so
		 * adding the ticket space would be a double count.
		 */
		if (check_overcommit &&
		    !can_overcommit(fs_info, space_info, 0, flush, false))
			break;
		if (num_bytes >= ticket->bytes) {
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
	trace_btrfs_space_reservation(fs_info, "space_info",
				      space_info->flags, num_bytes, 0);
	spin_unlock(&space_info->lock);
}

/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet.  So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head = &space_info->priority_tickets;

again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		if (num_bytes >= ticket->bytes) {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      ticket->bytes, 1);
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			trace_btrfs_space_reservation(fs_info, "space_info",
						      space_info->flags,
						      num_bytes, 1);
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      num_bytes);
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

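/*
 * Kick writeback of dirty delalloc pages so the reserved metadata backing
 * them can be returned.  Falls back to btrfs_start_delalloc_roots() when the
 * superblock's s_umount cannot be taken.
 */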
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire ->s_umount here, because the
		 * filesystem should guarantee that the delalloc inode list is
		 * empty once the filesystem is read-only (all dirty pages have
		 * been written to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

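/*
 * Translate a byte target into a number of metadata items to flush, using the
 * worst case cost of inserting one item as the unit.  Always returns at least
 * one item.
 */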
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calc the number of pages we need to flush for this reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue.  I.e. if there are more async pages than
		 * we require, wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the fs_info for the filesystem
 * @space_info - the space_info we're trying to reserve space for
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
			struct btrfs_space_info *space_info, u64 num_bytes,
			int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

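/*
 * Work out how many bytes the flushing machinery should try to reclaim.  If
 * there are outstanding tickets their sum is the target; otherwise fall back
 * to a heuristic based on how close we are to the 90%/95% full marks, capped
 * by what is actually reclaimable (bytes_may_use + bytes_reserved).
 */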
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

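/*
 * Decide whether the background reclaim worker should be kicked for a
 * reservation that succeeded but left the space_info almost (98%) full.
 */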
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

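/*
 * Walk the ticket list, failing each ticket with -ENOSPC and waking its
 * waiter.  Stop and return true as soon as a partially filled ticket is
 * found, which tells the caller that flushing made some progress and the
 * state machine should be restarted rather than giving up.
 */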
static bool wake_all_tickets(struct list_head *head)
{
	struct reserve_ticket *ticket;

	while (!list_empty(head)) {
		ticket = list_first_entry(head, struct reserve_ticket, list);
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);
		if (ticket->bytes != ticket->orig_bytes)
			return true;
	}
	return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create
		 * an underutilized metadata chunk.  So if this is our first
		 * run through the flushing state machine skip ALLOC_CHUNK_FORCE
		 * and commit the transaction.  If nothing has changed the next
		 * go around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (wake_all_tickets(&space_info->tickets)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

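/*
 * Flush states attempted, in order, by the synchronous reclaim paths:
 * priority_flush_states for BTRFS_RESERVE_FLUSH_LIMIT reservations and
 * evict_flush_states for BTRFS_RESERVE_FLUSH_EVICT (see
 * handle_reserve_ticket()).
 */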
static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

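/*
 * Synchronously run the given table of flush states until either the ticket
 * has been satisfied (ticket->bytes == 0) or every state has been tried once.
 * Used by the priority and eviction reservations, which do their own flushing
 * here instead of waiting on the async reclaim worker.
 */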
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}

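/*
 * Sleep until the ticket has either been satisfied or failed by the flushing
 * code.  The wait is killable; a fatal signal fails the ticket with -EINTR.
 */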
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	u64 reclaim_bytes = 0;
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		if (ticket->bytes < ticket->orig_bytes)
			reclaim_bytes = ticket->orig_bytes - ticket->bytes;
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	ASSERT(list_empty(&ticket->list));
	return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
 * @fs_info - the filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info.  If there
 * is not enough space it will make an attempt to flush out space to make room.
 * It will do this by flushing delalloc if possible or committing the
 * transaction.  If @flush is BTRFS_RESERVE_NO_FLUSH then no attempts to regain
 * reservations will be made and this will fail if there is not enough space
 * already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if ((used + orig_bytes <= space_info->total_bytes) ||
	    can_overcommit(fs_info, space_info, orig_bytes, flush,
			   system_chunk)) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		trace_btrfs_space_reservation(fs_info, "space_info",
					      space_info->flags, orig_bytes, 1);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then set up our reservation
	 * ticket and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.orig_bytes = orig_bytes;
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room.  It will do this by flushing delalloc if
 * possible or committing the transaction.  If @flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}