// SPDX-License-Identifier: GPL-2.0

#include "misc.h"
#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "block-group.h"

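/*
 * Sum of the used, reserved, pinned and readonly byte counters of a
 * space_info, optionally including the outstanding bytes_may_use
 * reservations.
 */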
u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
			  bool may_use_included)
{
	ASSERT(s_info);
	return s_info->bytes_used + s_info->bytes_reserved +
		s_info->bytes_pinned + s_info->bytes_readonly +
		(may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list)
		found->full = 0;
	rcu_read_unlock();
}

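/*
 * Allocate and initialize a new space_info for the given block group type
 * flags, add it to sysfs and link it into the fs_info->space_info list.
 */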
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
	struct btrfs_space_info *space_info;
	int i;
	int ret;

	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
	if (!space_info)
		return -ENOMEM;

	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				  GFP_KERNEL);
	if (ret) {
		kfree(space_info);
		return ret;
	}

	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		INIT_LIST_HEAD(&space_info->block_groups[i]);
	init_rwsem(&space_info->groups_sem);
	spin_lock_init(&space_info->lock);
	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
	init_waitqueue_head(&space_info->wait);
	INIT_LIST_HEAD(&space_info->ro_bgs);
	INIT_LIST_HEAD(&space_info->tickets);
	INIT_LIST_HEAD(&space_info->priority_tickets);

	ret = btrfs_sysfs_add_space_info_type(info, space_info);
	if (ret)
		return ret;

	list_add_rcu(&space_info->list, &info->space_info);
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		info->data_sinfo = space_info;

	return ret;
}

int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *disk_super;
	u64 features;
	u64 flags;
	int mixed = 0;
	int ret;

	disk_super = fs_info->super_copy;
	if (!btrfs_super_root(disk_super))
		return -EINVAL;

	features = btrfs_super_incompat_flags(disk_super);
	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

	flags = BTRFS_BLOCK_GROUP_SYSTEM;
	ret = create_space_info(fs_info, flags);
	if (ret)
		goto out;

	if (mixed) {
		flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	} else {
		flags = BTRFS_BLOCK_GROUP_METADATA;
		ret = create_space_info(fs_info, flags);
		if (ret)
			goto out;

		flags = BTRFS_BLOCK_GROUP_DATA;
		ret = create_space_info(fs_info, flags);
	}
out:
	return ret;
}

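/*
 * Account the space of a new or resized block group against its space_info:
 * bump the total and used counters and hand any newly usable bytes to
 * waiting tickets.
 */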
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
			     u64 total_bytes, u64 bytes_used,
			     u64 bytes_readonly,
			     struct btrfs_space_info **space_info)
{
	struct btrfs_space_info *found;
	int factor;

	factor = btrfs_bg_type_to_factor(flags);

	found = btrfs_find_space_info(info, flags);
	ASSERT(found);
	spin_lock(&found->lock);
	found->total_bytes += total_bytes;
	found->disk_total += total_bytes * factor;
	found->bytes_used += bytes_used;
	found->disk_used += bytes_used * factor;
	found->bytes_readonly += bytes_readonly;
	if (total_bytes > 0)
		found->full = 0;
	btrfs_space_info_add_new_bytes(info, found,
				       total_bytes - bytes_used -
				       bytes_readonly);
	spin_unlock(&found->lock);
	*space_info = found;
}

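/*
 * Find the space_info matching the block group type bits in @flags, or NULL
 * if no such space_info exists.
 */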
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
					       u64 flags)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

	rcu_read_lock();
	list_for_each_entry_rcu(found, head, list) {
		if (found->flags & flags) {
			rcu_read_unlock();
			return found;
		}
	}
	rcu_read_unlock();
	return NULL;
}

static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
	return (global->size << 1);
}

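/*
 * Decide whether a metadata reservation of @bytes may overcommit the space
 * currently allocated to chunks, based on the unallocated device space and
 * on how aggressively @flush lets us reclaim.
 */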
static int can_overcommit(struct btrfs_fs_info *fs_info,
			  struct btrfs_space_info *space_info, u64 bytes,
			  enum btrfs_reserve_flush_enum flush,
			  bool system_chunk)
{
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	u64 profile;
	u64 space_size;
	u64 avail;
	u64 used;
	int factor;

	/* Don't overcommit when in mixed mode. */
	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
		return 0;

	if (system_chunk)
		profile = btrfs_system_alloc_profile(fs_info);
	else
		profile = btrfs_metadata_alloc_profile(fs_info);

	used = btrfs_space_info_used(space_info, false);

	/*
	 * We only want to allow overcommitting if we have lots of actual space
	 * free, but if we don't have enough space to handle the global reserve
	 * space then we could end up having a real enospc problem when trying
	 * to allocate a chunk or some other such important allocation.
	 */
	spin_lock(&global_rsv->lock);
	space_size = calc_global_rsv_need_space(global_rsv);
	spin_unlock(&global_rsv->lock);
	if (used + space_size >= space_info->total_bytes)
		return 0;

	used += space_info->bytes_may_use;

	avail = atomic64_read(&fs_info->free_chunk_space);

	/*
	 * If we have dup, raid1 or raid10 then only half of the free
	 * space is actually usable. For raid56, the space info used
	 * doesn't include the parity drive, so we don't have to
	 * change the math.
	 */
	factor = btrfs_bg_type_to_factor(profile);
	avail = div_u64(avail, factor);

	/*
	 * If we aren't flushing all things, let us overcommit up to
	 * half of the space. If we can flush, don't let us overcommit
	 * too much, let it overcommit up to 1/8 of the space.
	 */
	if (flush == BTRFS_RESERVE_FLUSH_ALL)
		avail >>= 3;
	else
		avail >>= 1;

	if (used + bytes < space_info->total_bytes + avail)
		return 1;
	return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsv's.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct list_head *head;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;

	spin_lock(&space_info->lock);
	head = &space_info->priority_tickets;
	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);

again:
	while (!list_empty(head)) {
		struct reserve_ticket *ticket;
		u64 used = btrfs_space_info_used(space_info, true);

		ticket = list_first_entry(head, struct reserve_ticket, list);

		/* Check and see if our ticket can be satisfied now. */
		if ((used + ticket->bytes <= space_info->total_bytes) ||
		    can_overcommit(fs_info, space_info, ticket->bytes, flush,
				   false)) {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			list_del_init(&ticket->list);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			break;
		}
	}

	if (head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		flush = BTRFS_RESERVE_FLUSH_ALL;
		goto again;
	}
	spin_unlock(&space_info->lock);
}

/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 num_bytes)
{
	struct reserve_ticket *ticket;
	struct list_head *head = &space_info->priority_tickets;

again:
	while (!list_empty(head) && num_bytes) {
		ticket = list_first_entry(head, struct reserve_ticket,
					  list);
		if (num_bytes >= ticket->bytes) {
			list_del_init(&ticket->list);
			num_bytes -= ticket->bytes;
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      ticket->bytes);
			ticket->bytes = 0;
			space_info->tickets_id++;
			wake_up(&ticket->wait);
		} else {
			btrfs_space_info_update_bytes_may_use(fs_info,
							      space_info,
							      num_bytes);
			ticket->bytes -= num_bytes;
			num_bytes = 0;
		}
	}

	if (num_bytes && head == &space_info->priority_tickets) {
		head = &space_info->tickets;
		goto again;
	}
}

#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
do {									\
	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;		\
	spin_lock(&__rsv->lock);					\
	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
		   __rsv->size, __rsv->reserved);			\
	spin_unlock(&__rsv->lock);					\
} while (0)

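/*
 * Print the usage counters of a space_info and of the global block reserves
 * to the kernel log, optionally dumping every block group as well.
 */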
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *info, u64 bytes,
			   int dump_block_groups)
{
	struct btrfs_block_group_cache *cache;
	int index = 0;

	spin_lock(&info->lock);
	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
		   info->flags,
		   info->total_bytes - btrfs_space_info_used(info, true),
		   info->full ? "" : "not ");
	btrfs_info(fs_info,
		"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
		info->total_bytes, info->bytes_used, info->bytes_pinned,
		info->bytes_reserved, info->bytes_may_use,
		info->bytes_readonly);
	spin_unlock(&info->lock);

	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

	if (!dump_block_groups)
		return;

	down_read(&info->groups_sem);
again:
	list_for_each_entry(cache, &info->block_groups[index], list) {
		spin_lock(&cache->lock);
		btrfs_info(fs_info,
			"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
			cache->key.objectid, cache->key.offset,
			btrfs_block_group_used(&cache->item), cache->pinned,
			cache->reserved, cache->ro ? "[readonly]" : "");
		btrfs_dump_free_space(cache, bytes);
		spin_unlock(&cache->lock);
	}
	if (++index < BTRFS_NR_RAID_TYPES)
		goto again;
	up_read(&info->groups_sem);
}

static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
					 unsigned long nr_pages, int nr_items)
{
	struct super_block *sb = fs_info->sb;

	if (down_read_trylock(&sb->s_umount)) {
		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
		up_read(&sb->s_umount);
	} else {
		/*
		 * We needn't worry about the filesystem going from r/w to r/o
		 * even though we don't acquire the ->s_umount mutex, because
		 * the filesystem should guarantee that the delalloc inode list
		 * is empty once the filesystem is read-only (all dirty pages
		 * have been written to disk).
		 */
		btrfs_start_delalloc_roots(fs_info, nr_items);
		if (!current->journal_info)
			btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
	}
}

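/*
 * Convert a byte amount into a number of metadata items to reclaim, based on
 * the worst-case cost of inserting a single item.
 */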
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
					u64 to_reclaim)
{
	u64 bytes;
	u64 nr;

	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	nr = div64_u64(to_reclaim, bytes);
	if (!nr)
		nr = 1;
	return nr;
}

#define EXTENT_SIZE_PER_ITEM	SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
			    u64 orig, bool wait_ordered)
{
	struct btrfs_space_info *space_info;
	struct btrfs_trans_handle *trans;
	u64 delalloc_bytes;
	u64 dio_bytes;
	u64 async_pages;
	u64 items;
	long time_left;
	unsigned long nr_pages;
	int loops;

	/* Calc the number of pages we need to flush for this space reservation */
	items = calc_reclaim_items_nr(fs_info, to_reclaim);
	to_reclaim = items * EXTENT_SIZE_PER_ITEM;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	if (delalloc_bytes == 0 && dio_bytes == 0) {
		if (trans)
			return;
		if (wait_ordered)
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		return;
	}

	/*
	 * If we are doing more ordered than delalloc we need to just wait on
	 * ordered extents, otherwise we'll waste time trying to flush delalloc
	 * that likely won't give us the space back we need.
	 */
	if (dio_bytes > delalloc_bytes)
		wait_ordered = true;

	loops = 0;
	while ((delalloc_bytes || dio_bytes) && loops < 3) {
		nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

		/*
		 * Triggers inode writeback for up to nr_pages. This will invoke
		 * the ->writepages callback and trigger delalloc filling
		 * (btrfs_run_delalloc_range()).
		 */
		btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

		/*
		 * We need to wait for the compressed pages to start before
		 * we continue.
		 */
		async_pages = atomic_read(&fs_info->async_delalloc_pages);
		if (!async_pages)
			goto skip_async;

		/*
		 * Calculate how many compressed pages we want to be written
		 * before we continue. I.e. if there are more async pages than
		 * we require, wait_event will wait until nr_pages are written.
		 */
		if (async_pages <= nr_pages)
			async_pages = 0;
		else
			async_pages -= nr_pages;

		wait_event(fs_info->async_submit_wait,
			   atomic_read(&fs_info->async_delalloc_pages) <=
			   (int)async_pages);
skip_async:
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets) &&
		    list_empty(&space_info->priority_tickets)) {
			spin_unlock(&space_info->lock);
			break;
		}
		spin_unlock(&space_info->lock);

		loops++;
		if (wait_ordered && !trans) {
			btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
		} else {
			time_left = schedule_timeout_killable(1);
			if (time_left)
				break;
		}
		delalloc_bytes = percpu_counter_sum_positive(
						&fs_info->delalloc_bytes);
		dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
	}
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the filesystem
 * @space_info - the space_info we are trying to make a reservation in
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				  struct btrfs_space_info *space_info)
{
	struct reserve_ticket *ticket = NULL;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
	struct btrfs_trans_handle *trans;
	u64 bytes_needed;
	u64 reclaim_bytes = 0;

	trans = (struct btrfs_trans_handle *)current->journal_info;
	if (trans)
		return -EAGAIN;

	spin_lock(&space_info->lock);
	if (!list_empty(&space_info->priority_tickets))
		ticket = list_first_entry(&space_info->priority_tickets,
					  struct reserve_ticket, list);
	else if (!list_empty(&space_info->tickets))
		ticket = list_first_entry(&space_info->tickets,
					  struct reserve_ticket, list);
	bytes_needed = (ticket) ? ticket->bytes : 0;
	spin_unlock(&space_info->lock);

	if (!bytes_needed)
		return 0;

	trans = btrfs_join_transaction(fs_info->extent_root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);

	/*
	 * See if there is enough pinned space to make this reservation, or if
	 * we have block groups that are going to be freed, allowing us to
	 * possibly do a chunk allocation the next loop through.
	 */
	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
	    __percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
		goto commit;

	/*
	 * See if there is some space in the delayed insertion reservation for
	 * this reservation.
	 */
	if (space_info != delayed_rsv->space_info)
		goto enospc;

	spin_lock(&delayed_rsv->lock);
	reclaim_bytes += delayed_rsv->reserved;
	spin_unlock(&delayed_rsv->lock);

	spin_lock(&delayed_refs_rsv->lock);
	reclaim_bytes += delayed_refs_rsv->reserved;
	spin_unlock(&delayed_refs_rsv->lock);
	if (reclaim_bytes >= bytes_needed)
		goto commit;
	bytes_needed -= reclaim_bytes;

	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				     bytes_needed,
				     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
		goto enospc;

commit:
	return btrfs_commit_transaction(trans);
enospc:
	btrfs_end_transaction(trans);
	return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
		       struct btrfs_space_info *space_info, u64 num_bytes,
		       int state)
{
	struct btrfs_root *root = fs_info->extent_root;
	struct btrfs_trans_handle *trans;
	int nr;
	int ret = 0;

	switch (state) {
	case FLUSH_DELAYED_ITEMS_NR:
	case FLUSH_DELAYED_ITEMS:
		if (state == FLUSH_DELAYED_ITEMS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
		else
			nr = -1;

		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_run_delayed_items_nr(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case FLUSH_DELALLOC:
	case FLUSH_DELALLOC_WAIT:
		shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				state == FLUSH_DELALLOC_WAIT);
		break;
	case FLUSH_DELAYED_REFS_NR:
	case FLUSH_DELAYED_REFS:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		if (state == FLUSH_DELAYED_REFS_NR)
			nr = calc_reclaim_items_nr(fs_info, num_bytes);
		else
			nr = 0;
		btrfs_run_delayed_refs(trans, nr);
		btrfs_end_transaction(trans);
		break;
	case ALLOC_CHUNK:
	case ALLOC_CHUNK_FORCE:
		trans = btrfs_join_transaction(root);
		if (IS_ERR(trans)) {
			ret = PTR_ERR(trans);
			break;
		}
		ret = btrfs_chunk_alloc(trans,
				btrfs_metadata_alloc_profile(fs_info),
				(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
					CHUNK_ALLOC_FORCE);
		btrfs_end_transaction(trans);
		if (ret > 0 || ret == -ENOSPC)
			ret = 0;
		break;
	case RUN_DELAYED_IPUTS:
		/*
		 * If we have pending delayed iputs then we could free up a
		 * bunch of pinned space, so make sure we run the iputs before
		 * we do our pinned bytes check below.
		 */
		btrfs_run_delayed_iputs(fs_info);
		btrfs_wait_on_delayed_iputs(fs_info);
		break;
	case COMMIT_TRANS:
		ret = may_commit_transaction(fs_info, space_info);
		break;
	default:
		ret = -ENOSPC;
		break;
	}

	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				ret);
	return;
}

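/*
 * Work out how many bytes we should try to reclaim: the sum of all pending
 * ticket reservations if there are any, otherwise an estimate based on how
 * close we are to the overcommit limit.
 */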
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 bool system_chunk)
{
	struct reserve_ticket *ticket;
	u64 used;
	u64 expected;
	u64 to_reclaim = 0;

	list_for_each_entry(ticket, &space_info->tickets, list)
		to_reclaim += ticket->bytes;
	list_for_each_entry(ticket, &space_info->priority_tickets, list)
		to_reclaim += ticket->bytes;
	if (to_reclaim)
		return to_reclaim;

	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
	if (can_overcommit(fs_info, space_info, to_reclaim,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		return 0;

	used = btrfs_space_info_used(space_info, true);

	if (can_overcommit(fs_info, space_info, SZ_1M,
			   BTRFS_RESERVE_FLUSH_ALL, system_chunk))
		expected = div_factor_fine(space_info->total_bytes, 95);
	else
		expected = div_factor_fine(space_info->total_bytes, 90);

	if (used > expected)
		to_reclaim = used - expected;
	else
		to_reclaim = 0;
	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				     space_info->bytes_reserved);
	return to_reclaim;
}

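/*
 * Decide whether the async reclaim worker should be kicked pre-emptively:
 * only if there is something to reclaim, overall usage has crossed the 98%
 * threshold without the space_info being simply full, and the filesystem is
 * not closing or being remounted.
 */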
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					u64 used, bool system_chunk)
{
	u64 thresh = div_factor_fine(space_info->total_bytes, 98);

	/* If we're just plain full then async reclaim just slows us down. */
	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
		return 0;

	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
					      system_chunk))
		return 0;

	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
		!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

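/*
 * Fail pending tickets with -ENOSPC and wake their waiters. Stop and return
 * true as soon as we hit a ticket that was already partially satisfied, since
 * more flushing may still let it complete.
 */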
static bool wake_all_tickets(struct list_head *head)
{
	struct reserve_ticket *ticket;

	while (!list_empty(head)) {
		ticket = list_first_entry(head, struct reserve_ticket, list);
		list_del_init(&ticket->list);
		ticket->error = -ENOSPC;
		wake_up(&ticket->wait);
		if (ticket->bytes != ticket->orig_bytes)
			return true;
	}
	return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
	struct btrfs_fs_info *fs_info;
	struct btrfs_space_info *space_info;
	u64 to_reclaim;
	int flush_state;
	int commit_cycles = 0;
	u64 last_tickets_id;

	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		space_info->flush = 0;
		spin_unlock(&space_info->lock);
		return;
	}
	last_tickets_id = space_info->tickets_id;
	spin_unlock(&space_info->lock);

	flush_state = FLUSH_DELAYED_ITEMS_NR;
	do {
		flush_space(fs_info, space_info, to_reclaim, flush_state);
		spin_lock(&space_info->lock);
		if (list_empty(&space_info->tickets)) {
			space_info->flush = 0;
			spin_unlock(&space_info->lock);
			return;
		}
		to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
							      space_info,
							      false);
		if (last_tickets_id == space_info->tickets_id) {
			flush_state++;
		} else {
			last_tickets_id = space_info->tickets_id;
			flush_state = FLUSH_DELAYED_ITEMS_NR;
			if (commit_cycles)
				commit_cycles--;
		}

		/*
		 * We don't want to force a chunk allocation until we've tried
		 * pretty hard to reclaim space.  Think of the case where we
		 * freed up a bunch of space and so have a lot of pinned space
		 * to reclaim.  We would rather use that than possibly create
		 * an underutilized metadata chunk.  So if this is our first
		 * run through the flushing state machine skip ALLOC_CHUNK_FORCE
		 * and commit the transaction.  If nothing has changed the next
		 * go around then we can force a chunk allocation.
		 */
		if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
			flush_state++;

		if (flush_state > COMMIT_TRANS) {
			commit_cycles++;
			if (commit_cycles > 2) {
				if (wake_all_tickets(&space_info->tickets)) {
					flush_state = FLUSH_DELAYED_ITEMS_NR;
					commit_cycles--;
				} else {
					space_info->flush = 0;
				}
			} else {
				flush_state = FLUSH_DELAYED_ITEMS_NR;
			}
		}
		spin_unlock(&space_info->lock);
	} while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	ALLOC_CHUNK,
};

static const enum btrfs_flush_state evict_flush_states[] = {
	FLUSH_DELAYED_ITEMS_NR,
	FLUSH_DELAYED_ITEMS,
	FLUSH_DELAYED_REFS_NR,
	FLUSH_DELAYED_REFS,
	FLUSH_DELALLOC,
	FLUSH_DELALLOC_WAIT,
	ALLOC_CHUNK,
	COMMIT_TRANS,
};

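/*
 * Flush space synchronously for a priority ticket, walking the given array of
 * flush states until the ticket is satisfied or the states are exhausted.
 */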
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
						      false);
	if (!to_reclaim) {
		spin_unlock(&space_info->lock);
		return;
	}
	spin_unlock(&space_info->lock);

	flush_state = 0;
	do {
		flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
		spin_unlock(&space_info->lock);
	} while (flush_state < states_nr);
}

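/*
 * Sleep until the ticket is either satisfied or fails, dropping the
 * space_info lock while we wait. A fatal signal marks the ticket with -EINTR.
 */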
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
 * @fs_info - the fs
 * @space_info - the space_info for the reservation
 * @ticket - the ticket for the reservation
 * @flush - how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 enum btrfs_reserve_flush_enum flush)
{
	u64 reclaim_bytes = 0;
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_ALL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	default:
		ASSERT(0);
		break;
	}

	spin_lock(&space_info->lock);
	ret = ticket->error;
	if (ticket->bytes || ticket->error) {
		if (ticket->bytes < ticket->orig_bytes)
			reclaim_bytes = ticket->orig_bytes - ticket->bytes;
		list_del_init(&ticket->list);
		if (!ret)
			ret = -ENOSPC;
	}
	spin_unlock(&space_info->lock);

	if (reclaim_bytes)
		btrfs_space_info_add_old_bytes(fs_info, space_info,
					       reclaim_bytes);
	ASSERT(list_empty(&ticket->list));
	return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from the space_info's space
 * @fs_info - the filesystem
 * @space_info - the space info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether we are reserving for the system chunk
 *
 * This will reserve orig_bytes number of bytes from the space info. If there
 * is not enough space it will make an attempt to flush out space to make room.
 * It will do this by flushing delalloc if possible or committing the
 * transaction. If @flush is BTRFS_RESERVE_NO_FLUSH then no attempts to regain
 * reservations will be made and this will fail if there is not enough space
 * already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				    struct btrfs_space_info *space_info,
				    u64 orig_bytes,
				    enum btrfs_reserve_flush_enum flush,
				    bool system_chunk)
{
	struct reserve_ticket ticket;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);
	pending_tickets = !list_empty(&space_info->tickets) ||
		!list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= space_info->total_bytes) ||
	     can_overcommit(fs_info, space_info, orig_bytes, flush,
			   system_chunk))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.orig_bytes = orig_bytes;
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		init_waitqueue_head(&ticket.wait);
		if (flush == BTRFS_RESERVE_FLUSH_ALL) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq,
					   &fs_info->async_reclaim_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		used += orig_bytes;
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    need_do_async_reclaim(fs_info, space_info,
					  used, system_chunk) &&
		    !work_busy(&fs_info->async_reclaim_work)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->async_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If @flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
	int ret;
	bool system_chunk = (root == fs_info->chunk_root);

	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				       orig_bytes, flush, system_chunk);
	if (ret == -ENOSPC &&
	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
		if (block_rsv != global_rsv &&
		    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
			ret = 0;
	}
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}