Blame - fs/btrfs/space-info.c - SHIFTPHONES/mainline/linux

blob: 9cb511d8cd9da695dfa5fd90a87448fa6037321b [file] [log] [blame]

Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
David Sterba	784352f	2019-08-21 18:54:28 +0200	[diff] [blame]	3	#include "misc.h"
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	4	#include "ctree.h"
				5	#include "space-info.h"
				6	#include "sysfs.h"
				7	#include "volumes.h"
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	8	#include "free-space-cache.h"
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	9	#include "ordered-data.h"
				10	#include "transaction.h"
Josef Bacik	aac0023	2019-06-20 15:37:44 -0400	[diff] [blame]	11	#include "block-group.h"
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	12
Josef Bacik	4b8b052	2020-02-04 13:18:56 -0500	[diff] [blame]	13	/*
				14	* HOW DOES SPACE RESERVATION WORK
				15	*
				16	* If you want to know about delalloc specifically, there is a separate comment
				17	* for that with the delalloc code. This comment is about how the whole system
				18	* works generally.
				19	*
				20	* BASIC CONCEPTS
				21	*
				22	* 1) space_info. This is the ultimate arbiter of how much space we can use.
				23	* There's a description of the bytes_ fields with the struct declaration,
				24	* refer to that for specifics on each field. Suffice it to say that for
				25	* reservations we care about total_bytes - SUM(space_info->bytes_) when
				26	* determining if there is space to make an allocation. There is a space_info
				27	* for METADATA, SYSTEM, and DATA areas.
				28	*
				29	* 2) block_rsv's. These are basically buckets for every different type of
				30	* metadata reservation we have. You can see the comment in the block_rsv
				31	* code on the rules for each type, but generally block_rsv->reserved is how
				32	* much space is accounted for in space_info->bytes_may_use.
				33	*
				34	* 3) btrfs_calc*_size. These are the worst case calculations we use based
				35	* on the number of items we will want to modify. We have one for changing
				36	* items, and one for inserting new items. Generally we use these helpers to
				37	* determine the size of the block reserves, and then use the actual bytes
				38	* values to adjust the space_info counters.
				39	*
				40	* MAKING RESERVATIONS, THE NORMAL CASE
				41	*
				42	* We call into either btrfs_reserve_data_bytes() or
				43	* btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
				44	* num_bytes we want to reserve.
				45	*
				46	* ->reserve
				47	* space_info->bytes_may_use += num_bytes
				48	*
				49	* ->extent allocation
				50	* Call btrfs_add_reserved_bytes() which does
				51	* space_info->bytes_may_use -= num_bytes
				52	* space_info->bytes_reserved += extent_bytes
				53	*
				54	* ->insert reference
				55	* Call btrfs_update_block_group() which does
				56	* space_info->bytes_reserved -= extent_bytes
				57	* space_info->bytes_used += extent_bytes
				58	*
				59	* MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
				60	*
				61	* Assume we are unable to simply make the reservation because we do not have
				62	* enough space
				63	*
				64	* -> __reserve_bytes
				65	* create a reserve_ticket with ->bytes set to our reservation, add it to
				66	* the tail of space_info->tickets, kick async flush thread
				67	*
				68	* ->handle_reserve_ticket
				69	* wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
				70	* on the ticket.
				71	*
				72	* -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
				73	* Flushes various things attempting to free up space.
				74	*
				75	* -> btrfs_try_granting_tickets()
				76	* This is called by anything that either subtracts space from
				77	* space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
				78	* space_info->total_bytes. This loops through the ->priority_tickets and
				79	* then the ->tickets list checking to see if the reservation can be
				80	* completed. If it can the space is added to space_info->bytes_may_use and
				81	* the ticket is woken up.
				82	*
				83	* -> ticket wakeup
				84	* Check if ->bytes == 0, if it does we got our reservation and we can carry
				85	* on, if not return the appropriate error (ENOSPC, but can be EINTR if we
				86	* were interrupted.)
				87	*
				88	* MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
				89	*
				90	* Same as the above, except we add ourselves to the
				91	* space_info->priority_tickets, and we do not use ticket->wait, we simply
				92	* call flush_space() ourselves for the states that are safe for us to call
				93	* without deadlocking and hope for the best.
				94	*
				95	* THE FLUSHING STATES
				96	*
				97	* Generally speaking we will have two cases for each state, a "nice" state
				98	* and a "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
				99	* reduce the locking over head on the various trees, and even to keep from
				100	* doing any work at all in the case of delayed refs. Each of these delayed
				101	* things however hold reservations, and so letting them run allows us to
				102	* reclaim space so we can make new reservations.
				103	*
				104	* FLUSH_DELAYED_ITEMS
				105	* Every inode has a delayed item to update the inode. Take a simple write
				106	* for example, we would update the inode item at write time to update the
				107	* mtime, and then again at finish_ordered_io() time in order to update the
				108	* isize or bytes. We keep these delayed items to coalesce these operations
				109	* into a single operation done on demand. These are an easy way to reclaim
				110	* metadata space.
				111	*
				112	* FLUSH_DELALLOC
				113	* Look at the delalloc comment to get an idea of how much space is reserved
				114	* for delayed allocation. We can reclaim some of this space simply by
				115	* running delalloc, but usually we need to wait for ordered extents to
				116	* reclaim the bulk of this space.
				117	*
				118	* FLUSH_DELAYED_REFS
				119	* We have a block reserve for the outstanding delayed refs space, and every
				120	* delayed ref operation holds a reservation. Running these is a quick way
				121	* to reclaim space, but we want to hold this until the end because COW can
				122	* churn a lot and we can avoid making some extent tree modifications if we
				123	* are able to delay for as long as possible.
				124	*
				125	* ALLOC_CHUNK
				126	* We will skip this the first time through space reservation, because of
				127	* overcommit and we don't want to have a lot of useless metadata space when
				128	* our worst case reservations will likely never come true.
				129	*
				130	* RUN_DELAYED_IPUTS
				131	* If we're freeing inodes we're likely freeing checksums, file extent
				132	* items, and extent tree items. Loads of space could be freed up by these
				133	* operations, however they won't be usable until the transaction commits.
				134	*
				135	* COMMIT_TRANS
				136	* may_commit_transaction() is the ultimate arbiter on whether we commit the
				137	* transaction or not. In order to avoid constantly churning we do all the
				138	* above flushing first and then commit the transaction as the last resort.
				139	* However we need to take into account things like pinned space that would
				140	* be freed, plus any delayed work we may not have gotten rid of in the case
				141	* of metadata.
				142	*
				143	* OVERCOMMIT
				144	*
				145	* Because we hold so many reservations for metadata we will allow you to
				146	* reserve more space than is currently free in the currently allocated
				147	* metadata space. This only happens with metadata, data does not allow
				148	* overcommitting.
				149	*
				150	* You can see the current logic for when we allow overcommit in
				151	* btrfs_can_overcommit(), but it only applies to unallocated space. If there
				152	* is no unallocated space to be had, all reservations are kept within the
				153	* free space in the allocated metadata chunks.
				154	*
				155	* Because of overcommitting, you generally want to use the
				156	* btrfs_can_overcommit() logic for metadata allocations, as it does the right
				157	* thing with or without extra unallocated space.
				158	*/
				159
David Sterba	e1f60a6	2019-10-01 19:57:39 +0200	[diff] [blame]	160	u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	161	bool may_use_included)
				162	{
				163	ASSERT(s_info);
				164	return s_info->bytes_used + s_info->bytes_reserved +
				165	s_info->bytes_pinned + s_info->bytes_readonly +
				166	(may_use_included ? s_info->bytes_may_use : 0);
				167	}
				168
				169	/*
				170	* after adding space to the filesystem, we need to clear the full flags
				171	* on all the space infos.
				172	*/
				173	void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
				174	{
				175	struct list_head *head = &info->space_info;
				176	struct btrfs_space_info *found;
				177
				178	rcu_read_lock();
				179	list_for_each_entry_rcu(found, head, list)
				180	found->full = 0;
				181	rcu_read_unlock();
				182	}
				183
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	184	static int create_space_info(struct btrfs_fs_info *info, u64 flags)
				185	{
				186
				187	struct btrfs_space_info *space_info;
				188	int i;
				189	int ret;
				190
				191	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
				192	if (!space_info)
				193	return -ENOMEM;
				194
				195	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				196	GFP_KERNEL);
				197	if (ret) {
				198	kfree(space_info);
				199	return ret;
				200	}
				201
				202	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
				203	INIT_LIST_HEAD(&space_info->block_groups[i]);
				204	init_rwsem(&space_info->groups_sem);
				205	spin_lock_init(&space_info->lock);
				206	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
				207	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	208	INIT_LIST_HEAD(&space_info->ro_bgs);
				209	INIT_LIST_HEAD(&space_info->tickets);
				210	INIT_LIST_HEAD(&space_info->priority_tickets);
				211
David Sterba	b882327	2019-08-01 18:50:16 +0200	[diff] [blame]	212	ret = btrfs_sysfs_add_space_info_type(info, space_info);
				213	if (ret)
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	214	return ret;
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	215
				216	list_add_rcu(&space_info->list, &info->space_info);
				217	if (flags & BTRFS_BLOCK_GROUP_DATA)
				218	info->data_sinfo = space_info;
				219
				220	return ret;
				221	}
				222
				223	int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
				224	{
				225	struct btrfs_super_block *disk_super;
				226	u64 features;
				227	u64 flags;
				228	int mixed = 0;
				229	int ret;
				230
				231	disk_super = fs_info->super_copy;
				232	if (!btrfs_super_root(disk_super))
				233	return -EINVAL;
				234
				235	features = btrfs_super_incompat_flags(disk_super);
				236	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				237	mixed = 1;
				238
				239	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				240	ret = create_space_info(fs_info, flags);
				241	if (ret)
				242	goto out;
				243
				244	if (mixed) {
				245	flags = BTRFS_BLOCK_GROUP_METADATA \| BTRFS_BLOCK_GROUP_DATA;
				246	ret = create_space_info(fs_info, flags);
				247	} else {
				248	flags = BTRFS_BLOCK_GROUP_METADATA;
				249	ret = create_space_info(fs_info, flags);
				250	if (ret)
				251	goto out;
				252
				253	flags = BTRFS_BLOCK_GROUP_DATA;
				254	ret = create_space_info(fs_info, flags);
				255	}
				256	out:
				257	return ret;
				258	}
				259
				260	void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
				261	u64 total_bytes, u64 bytes_used,
				262	u64 bytes_readonly,
				263	struct btrfs_space_info **space_info)
				264	{
				265	struct btrfs_space_info *found;
				266	int factor;
				267
				268	factor = btrfs_bg_type_to_factor(flags);
				269
				270	found = btrfs_find_space_info(info, flags);
				271	ASSERT(found);
				272	spin_lock(&found->lock);
				273	found->total_bytes += total_bytes;
				274	found->disk_total += total_bytes * factor;
				275	found->bytes_used += bytes_used;
				276	found->disk_used += bytes_used * factor;
				277	found->bytes_readonly += bytes_readonly;
				278	if (total_bytes > 0)
				279	found->full = 0;
Josef Bacik	18fa228	2019-08-22 15:10:58 -0400	[diff] [blame]	280	btrfs_try_granting_tickets(info, found);
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	281	spin_unlock(&found->lock);
				282	*space_info = found;
				283	}
				284
				285	struct btrfs_space_info btrfs_find_space_info(struct btrfs_fs_info info,
				286	u64 flags)
				287	{
				288	struct list_head *head = &info->space_info;
				289	struct btrfs_space_info *found;
				290
				291	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
				292
				293	rcu_read_lock();
				294	list_for_each_entry_rcu(found, head, list) {
				295	if (found->flags & flags) {
				296	rcu_read_unlock();
				297	return found;
				298	}
				299	}
				300	rcu_read_unlock();
				301	return NULL;
				302	}
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	303
				304	static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
				305	{
				306	return (global->size << 1);
				307	}
				308
Josef Bacik	fa121a2	2020-02-21 16:41:10 -0500	[diff] [blame^]	309	static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
				310	struct btrfs_space_info *space_info,
				311	enum btrfs_reserve_flush_enum flush)
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	312	{
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	313	u64 profile;
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	314	u64 avail;
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	315	int factor;
				316
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	317	if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	318	profile = btrfs_system_alloc_profile(fs_info);
				319	else
				320	profile = btrfs_metadata_alloc_profile(fs_info);
				321
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	322	avail = atomic64_read(&fs_info->free_chunk_space);
				323
				324	/*
				325	* If we have dup, raid1 or raid10 then only half of the free
				326	* space is actually usable. For raid56, the space info used
				327	* doesn't include the parity drive, so we don't have to
				328	* change the math
				329	*/
				330	factor = btrfs_bg_type_to_factor(profile);
				331	avail = div_u64(avail, factor);
				332
				333	/*
				334	* If we aren't flushing all things, let us overcommit up to
				335	* 1/2th of the space. If we can flush, don't let us overcommit
				336	* too much, let it overcommit up to 1/8 of the space.
				337	*/
				338	if (flush == BTRFS_RESERVE_FLUSH_ALL)
				339	avail >>= 3;
				340	else
				341	avail >>= 1;
Josef Bacik	fa121a2	2020-02-21 16:41:10 -0500	[diff] [blame^]	342	return avail;
				343	}
				344
				345	int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
				346	struct btrfs_space_info *space_info, u64 bytes,
				347	enum btrfs_reserve_flush_enum flush)
				348	{
				349	u64 avail;
				350	u64 used;
				351
				352	/* Don't overcommit when in mixed mode */
				353	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
				354	return 0;
				355
				356	used = btrfs_space_info_used(space_info, true);
				357	avail = calc_available_free_space(fs_info, space_info, flush);
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	358
				359	if (used + bytes < space_info->total_bytes + avail)
				360	return 1;
				361	return 0;
				362	}
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	363
				364	/*
				365	* This is for space we already have accounted in space_info->bytes_may_use, so
				366	* basically when we're returning space from block_rsv's.
				367	*/
Josef Bacik	18fa228	2019-08-22 15:10:58 -0400	[diff] [blame]	368	void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
				369	struct btrfs_space_info *space_info)
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	370	{
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	371	struct list_head *head;
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	372	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	373
Josef Bacik	18fa228	2019-08-22 15:10:58 -0400	[diff] [blame]	374	lockdep_assert_held(&space_info->lock);
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	375
Josef Bacik	18fa228	2019-08-22 15:10:58 -0400	[diff] [blame]	376	head = &space_info->priority_tickets;
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	377	again:
Josef Bacik	9118264	2019-08-28 11:15:24 -0400	[diff] [blame]	378	while (!list_empty(head)) {
				379	struct reserve_ticket *ticket;
				380	u64 used = btrfs_space_info_used(space_info, true);
				381
				382	ticket = list_first_entry(head, struct reserve_ticket, list);
				383
				384	/* Check and see if our ticket can be satisified now. */
				385	if ((used + ticket->bytes <= space_info->total_bytes) \|\|
Josef Bacik	a30a3d2	2020-01-17 09:07:39 -0500	[diff] [blame]	386	btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
				387	flush)) {
Josef Bacik	9118264	2019-08-28 11:15:24 -0400	[diff] [blame]	388	btrfs_space_info_update_bytes_may_use(fs_info,
				389	space_info,
				390	ticket->bytes);
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	391	list_del_init(&ticket->list);
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	392	ticket->bytes = 0;
				393	space_info->tickets_id++;
				394	wake_up(&ticket->wait);
				395	} else {
Josef Bacik	9118264	2019-08-28 11:15:24 -0400	[diff] [blame]	396	break;
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	397	}
				398	}
				399
Josef Bacik	9118264	2019-08-28 11:15:24 -0400	[diff] [blame]	400	if (head == &space_info->priority_tickets) {
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	401	head = &space_info->tickets;
				402	flush = BTRFS_RESERVE_FLUSH_ALL;
				403	goto again;
				404	}
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	405	}
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	406
				407	#define DUMP_BLOCK_RSV(fs_info, rsv_name) \
				408	do { \
				409	struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
				410	spin_lock(&__rsv->lock); \
				411	btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
				412	__rsv->size, __rsv->reserved); \
				413	spin_unlock(&__rsv->lock); \
				414	} while (0)
				415
Josef Bacik	84fe47a	2019-08-22 15:19:04 -0400	[diff] [blame]	416	static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				417	struct btrfs_space_info *info)
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	418	{
Josef Bacik	84fe47a	2019-08-22 15:19:04 -0400	[diff] [blame]	419	lockdep_assert_held(&info->lock);
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	420
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	421	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
				422	info->flags,
				423	info->total_bytes - btrfs_space_info_used(info, true),
				424	info->full ? "" : "not ");
				425	btrfs_info(fs_info,
				426	"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
				427	info->total_bytes, info->bytes_used, info->bytes_pinned,
				428	info->bytes_reserved, info->bytes_may_use,
				429	info->bytes_readonly);
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	430
				431	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
				432	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
				433	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
				434	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
				435	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
				436
Josef Bacik	84fe47a	2019-08-22 15:19:04 -0400	[diff] [blame]	437	}
				438
				439	void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				440	struct btrfs_space_info *info, u64 bytes,
				441	int dump_block_groups)
				442	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	443	struct btrfs_block_group *cache;
Josef Bacik	84fe47a	2019-08-22 15:19:04 -0400	[diff] [blame]	444	int index = 0;
				445
				446	spin_lock(&info->lock);
				447	__btrfs_dump_space_info(fs_info, info);
				448	spin_unlock(&info->lock);
				449
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	450	if (!dump_block_groups)
				451	return;
				452
				453	down_read(&info->groups_sem);
				454	again:
				455	list_for_each_entry(cache, &info->block_groups[index], list) {
				456	spin_lock(&cache->lock);
				457	btrfs_info(fs_info,
				458	"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	459	cache->start, cache->length, cache->used, cache->pinned,
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	460	cache->reserved, cache->ro ? "[readonly]" : "");
				461	btrfs_dump_free_space(cache, bytes);
				462	spin_unlock(&cache->lock);
				463	}
				464	if (++index < BTRFS_NR_RAID_TYPES)
				465	goto again;
				466	up_read(&info->groups_sem);
				467	}
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	468
				469	static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
				470	unsigned long nr_pages, int nr_items)
				471	{
				472	struct super_block *sb = fs_info->sb;
				473
				474	if (down_read_trylock(&sb->s_umount)) {
				475	writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
				476	up_read(&sb->s_umount);
				477	} else {
				478	/*
				479	* We needn't worry the filesystem going from r/w to r/o though
				480	* we don't acquire ->s_umount mutex, because the filesystem
				481	* should guarantee the delalloc inodes list be empty after
				482	* the filesystem is readonly(all dirty pages are written to
				483	* the disk).
				484	*/
				485	btrfs_start_delalloc_roots(fs_info, nr_items);
				486	if (!current->journal_info)
				487	btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
				488	}
				489	}
				490
				491	static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
				492	u64 to_reclaim)
				493	{
				494	u64 bytes;
				495	u64 nr;
				496
Josef Bacik	2bd36e7	2019-08-22 15:14:33 -0400	[diff] [blame]	497	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	498	nr = div64_u64(to_reclaim, bytes);
				499	if (!nr)
				500	nr = 1;
				501	return nr;
				502	}
				503
				504	#define EXTENT_SIZE_PER_ITEM SZ_256K
				505
				506	/*
				507	* shrink metadata reservation for delalloc
				508	*/
				509	static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
				510	u64 orig, bool wait_ordered)
				511	{
				512	struct btrfs_space_info *space_info;
				513	struct btrfs_trans_handle *trans;
				514	u64 delalloc_bytes;
				515	u64 dio_bytes;
				516	u64 async_pages;
				517	u64 items;
				518	long time_left;
				519	unsigned long nr_pages;
				520	int loops;
				521
				522	/* Calc the number of the pages we need flush for space reservation */
				523	items = calc_reclaim_items_nr(fs_info, to_reclaim);
				524	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
				525
				526	trans = (struct btrfs_trans_handle *)current->journal_info;
				527	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				528
				529	delalloc_bytes = percpu_counter_sum_positive(
				530	&fs_info->delalloc_bytes);
				531	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
				532	if (delalloc_bytes == 0 && dio_bytes == 0) {
				533	if (trans)
				534	return;
				535	if (wait_ordered)
				536	btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
				537	return;
				538	}
				539
				540	/*
				541	* If we are doing more ordered than delalloc we need to just wait on
				542	* ordered extents, otherwise we'll waste time trying to flush delalloc
				543	* that likely won't give us the space back we need.
				544	*/
				545	if (dio_bytes > delalloc_bytes)
				546	wait_ordered = true;
				547
				548	loops = 0;
				549	while ((delalloc_bytes \|\| dio_bytes) && loops < 3) {
				550	nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
				551
				552	/*
				553	* Triggers inode writeback for up to nr_pages. This will invoke
				554	* ->writepages callback and trigger delalloc filling
				555	* (btrfs_run_delalloc_range()).
				556	*/
				557	btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
				558
				559	/*
				560	* We need to wait for the compressed pages to start before
				561	* we continue.
				562	*/
				563	async_pages = atomic_read(&fs_info->async_delalloc_pages);
				564	if (!async_pages)
				565	goto skip_async;
				566
				567	/*
				568	* Calculate how many compressed pages we want to be written
				569	* before we continue. I.e if there are more async pages than we
				570	* require wait_event will wait until nr_pages are written.
				571	*/
				572	if (async_pages <= nr_pages)
				573	async_pages = 0;
				574	else
				575	async_pages -= nr_pages;
				576
				577	wait_event(fs_info->async_submit_wait,
				578	atomic_read(&fs_info->async_delalloc_pages) <=
				579	(int)async_pages);
				580	skip_async:
				581	spin_lock(&space_info->lock);
				582	if (list_empty(&space_info->tickets) &&
				583	list_empty(&space_info->priority_tickets)) {
				584	spin_unlock(&space_info->lock);
				585	break;
				586	}
				587	spin_unlock(&space_info->lock);
				588
				589	loops++;
				590	if (wait_ordered && !trans) {
				591	btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
				592	} else {
				593	time_left = schedule_timeout_killable(1);
				594	if (time_left)
				595	break;
				596	}
				597	delalloc_bytes = percpu_counter_sum_positive(
				598	&fs_info->delalloc_bytes);
				599	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
				600	}
				601	}
				602
				603	/**
				604	* maybe_commit_transaction - possibly commit the transaction if its ok to
				605	* @root - the root we're allocating for
				606	* @bytes - the number of bytes we want to reserve
				607	* @force - force the commit
				608	*
				609	* This will check to make sure that committing the transaction will actually
				610	* get us somewhere and then commit the transaction if it does. Otherwise it
				611	* will return -ENOSPC.
				612	*/
				613	static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				614	struct btrfs_space_info *space_info)
				615	{
				616	struct reserve_ticket *ticket = NULL;
				617	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
				618	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
				619	struct btrfs_trans_handle *trans;
				620	u64 bytes_needed;
				621	u64 reclaim_bytes = 0;
Josef Bacik	00c0135	2019-08-22 15:11:00 -0400	[diff] [blame]	622	u64 cur_free_bytes = 0;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	623
				624	trans = (struct btrfs_trans_handle *)current->journal_info;
				625	if (trans)
				626	return -EAGAIN;
				627
				628	spin_lock(&space_info->lock);
Josef Bacik	00c0135	2019-08-22 15:11:00 -0400	[diff] [blame]	629	cur_free_bytes = btrfs_space_info_used(space_info, true);
				630	if (cur_free_bytes < space_info->total_bytes)
				631	cur_free_bytes = space_info->total_bytes - cur_free_bytes;
				632	else
				633	cur_free_bytes = 0;
				634
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	635	if (!list_empty(&space_info->priority_tickets))
				636	ticket = list_first_entry(&space_info->priority_tickets,
				637	struct reserve_ticket, list);
				638	else if (!list_empty(&space_info->tickets))
				639	ticket = list_first_entry(&space_info->tickets,
				640	struct reserve_ticket, list);
				641	bytes_needed = (ticket) ? ticket->bytes : 0;
Josef Bacik	00c0135	2019-08-22 15:11:00 -0400	[diff] [blame]	642
				643	if (bytes_needed > cur_free_bytes)
				644	bytes_needed -= cur_free_bytes;
				645	else
				646	bytes_needed = 0;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	647	spin_unlock(&space_info->lock);
				648
				649	if (!bytes_needed)
				650	return 0;
				651
				652	trans = btrfs_join_transaction(fs_info->extent_root);
				653	if (IS_ERR(trans))
				654	return PTR_ERR(trans);
				655
				656	/*
				657	* See if there is enough pinned space to make this reservation, or if
				658	* we have block groups that are going to be freed, allowing us to
				659	* possibly do a chunk allocation the next loop through.
				660	*/
				661	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) \|\|
				662	__percpu_counter_compare(&space_info->total_bytes_pinned,
				663	bytes_needed,
				664	BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
				665	goto commit;
				666
				667	/*
				668	* See if there is some space in the delayed insertion reservation for
				669	* this reservation.
				670	*/
				671	if (space_info != delayed_rsv->space_info)
				672	goto enospc;
				673
				674	spin_lock(&delayed_rsv->lock);
				675	reclaim_bytes += delayed_rsv->reserved;
				676	spin_unlock(&delayed_rsv->lock);
				677
				678	spin_lock(&delayed_refs_rsv->lock);
				679	reclaim_bytes += delayed_refs_rsv->reserved;
				680	spin_unlock(&delayed_refs_rsv->lock);
				681	if (reclaim_bytes >= bytes_needed)
				682	goto commit;
				683	bytes_needed -= reclaim_bytes;
				684
				685	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				686	bytes_needed,
				687	BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
				688	goto enospc;
				689
				690	commit:
				691	return btrfs_commit_transaction(trans);
				692	enospc:
				693	btrfs_end_transaction(trans);
				694	return -ENOSPC;
				695	}
				696
				697	/*
				698	* Try to flush some data based on policy set by @state. This is only advisory
				699	* and may fail for various reasons. The caller is supposed to examine the
				700	* state of @space_info to detect the outcome.
				701	*/
				702	static void flush_space(struct btrfs_fs_info *fs_info,
				703	struct btrfs_space_info *space_info, u64 num_bytes,
				704	int state)
				705	{
				706	struct btrfs_root *root = fs_info->extent_root;
				707	struct btrfs_trans_handle *trans;
				708	int nr;
				709	int ret = 0;
				710
				711	switch (state) {
				712	case FLUSH_DELAYED_ITEMS_NR:
				713	case FLUSH_DELAYED_ITEMS:
				714	if (state == FLUSH_DELAYED_ITEMS_NR)
				715	nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
				716	else
				717	nr = -1;
				718
				719	trans = btrfs_join_transaction(root);
				720	if (IS_ERR(trans)) {
				721	ret = PTR_ERR(trans);
				722	break;
				723	}
				724	ret = btrfs_run_delayed_items_nr(trans, nr);
				725	btrfs_end_transaction(trans);
				726	break;
				727	case FLUSH_DELALLOC:
				728	case FLUSH_DELALLOC_WAIT:
				729	shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				730	state == FLUSH_DELALLOC_WAIT);
				731	break;
				732	case FLUSH_DELAYED_REFS_NR:
				733	case FLUSH_DELAYED_REFS:
				734	trans = btrfs_join_transaction(root);
				735	if (IS_ERR(trans)) {
				736	ret = PTR_ERR(trans);
				737	break;
				738	}
				739	if (state == FLUSH_DELAYED_REFS_NR)
				740	nr = calc_reclaim_items_nr(fs_info, num_bytes);
				741	else
				742	nr = 0;
				743	btrfs_run_delayed_refs(trans, nr);
				744	btrfs_end_transaction(trans);
				745	break;
				746	case ALLOC_CHUNK:
				747	case ALLOC_CHUNK_FORCE:
				748	trans = btrfs_join_transaction(root);
				749	if (IS_ERR(trans)) {
				750	ret = PTR_ERR(trans);
				751	break;
				752	}
				753	ret = btrfs_chunk_alloc(trans,
				754	btrfs_metadata_alloc_profile(fs_info),
				755	(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
				756	CHUNK_ALLOC_FORCE);
				757	btrfs_end_transaction(trans);
				758	if (ret > 0 \|\| ret == -ENOSPC)
				759	ret = 0;
				760	break;
Josef Bacik	844245b	2019-08-01 18:19:33 -0400	[diff] [blame]	761	case RUN_DELAYED_IPUTS:
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	762	/*
				763	* If we have pending delayed iputs then we could free up a
				764	* bunch of pinned space, so make sure we run the iputs before
				765	* we do our pinned bytes check below.
				766	*/
				767	btrfs_run_delayed_iputs(fs_info);
				768	btrfs_wait_on_delayed_iputs(fs_info);
Josef Bacik	844245b	2019-08-01 18:19:33 -0400	[diff] [blame]	769	break;
				770	case COMMIT_TRANS:
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	771	ret = may_commit_transaction(fs_info, space_info);
				772	break;
				773	default:
				774	ret = -ENOSPC;
				775	break;
				776	}
				777
				778	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				779	ret);
				780	return;
				781	}
				782
				783	static inline u64
				784	btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	785	struct btrfs_space_info *space_info)
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	786	{
				787	struct reserve_ticket *ticket;
				788	u64 used;
Josef Bacik	fa121a2	2020-02-21 16:41:10 -0500	[diff] [blame^]	789	u64 avail;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	790	u64 expected;
				791	u64 to_reclaim = 0;
				792
				793	list_for_each_entry(ticket, &space_info->tickets, list)
				794	to_reclaim += ticket->bytes;
				795	list_for_each_entry(ticket, &space_info->priority_tickets, list)
				796	to_reclaim += ticket->bytes;
Josef Bacik	fa121a2	2020-02-21 16:41:10 -0500	[diff] [blame^]	797
				798	avail = calc_available_free_space(fs_info, space_info,
				799	BTRFS_RESERVE_FLUSH_ALL);
				800	used = btrfs_space_info_used(space_info, true);
				801
				802	/*
				803	* We may be flushing because suddenly we have less space than we had
				804	* before, and now we're well over-committed based on our current free
				805	* space. If that's the case add in our overage so we make sure to put
				806	* appropriate pressure on the flushing state machine.
				807	*/
				808	if (space_info->total_bytes + avail < used)
				809	to_reclaim += used - (space_info->total_bytes + avail);
				810
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	811	if (to_reclaim)
				812	return to_reclaim;
				813
				814	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
Josef Bacik	a30a3d2	2020-01-17 09:07:39 -0500	[diff] [blame]	815	if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
				816	BTRFS_RESERVE_FLUSH_ALL))
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	817	return 0;
				818
				819	used = btrfs_space_info_used(space_info, true);
				820
Josef Bacik	a30a3d2	2020-01-17 09:07:39 -0500	[diff] [blame]	821	if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
				822	BTRFS_RESERVE_FLUSH_ALL))
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	823	expected = div_factor_fine(space_info->total_bytes, 95);
				824	else
				825	expected = div_factor_fine(space_info->total_bytes, 90);
				826
				827	if (used > expected)
				828	to_reclaim = used - expected;
				829	else
				830	to_reclaim = 0;
				831	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				832	space_info->bytes_reserved);
				833	return to_reclaim;
				834	}
				835
				836	static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
				837	struct btrfs_space_info *space_info,
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	838	u64 used)
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	839	{
				840	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
				841
				842	/* If we're just plain full then async reclaim just slows us down. */
				843	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
				844	return 0;
				845
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	846	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	847	return 0;
				848
				849	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
				850	!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
				851	}
				852
Josef Bacik	2341ccd	2019-08-28 11:12:47 -0400	[diff] [blame]	853	/*
				854	* maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
				855	* @fs_info - fs_info for this fs
				856	* @space_info - the space info we were flushing
				857	*
				858	* We call this when we've exhausted our flushing ability and haven't made
				859	* progress in satisfying tickets. The reservation code handles tickets in
				860	* order, so if there is a large ticket first and then smaller ones we could
				861	* very well satisfy the smaller tickets. This will attempt to wake up any
				862	* tickets in the list to catch this case.
				863	*
				864	* This function returns true if it was able to make progress by clearing out
				865	* other tickets, or if it stumbles across a ticket that was smaller than the
				866	* first ticket.
				867	*/
				868	static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
				869	struct btrfs_space_info *space_info)
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	870	{
				871	struct reserve_ticket *ticket;
Josef Bacik	2341ccd	2019-08-28 11:12:47 -0400	[diff] [blame]	872	u64 tickets_id = space_info->tickets_id;
				873	u64 first_ticket_bytes = 0;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	874
Josef Bacik	84fe47a	2019-08-22 15:19:04 -0400	[diff] [blame]	875	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				876	btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
				877	__btrfs_dump_space_info(fs_info, space_info);
				878	}
				879
Josef Bacik	2341ccd	2019-08-28 11:12:47 -0400	[diff] [blame]	880	while (!list_empty(&space_info->tickets) &&
				881	tickets_id == space_info->tickets_id) {
				882	ticket = list_first_entry(&space_info->tickets,
				883	struct reserve_ticket, list);
				884
				885	/*
				886	* may_commit_transaction will avoid committing the transaction
				887	* if it doesn't feel like the space reclaimed by the commit
				888	* would result in the ticket succeeding. However if we have a
				889	* smaller ticket in the queue it may be small enough to be
				890	* satisified by committing the transaction, so if any
				891	* subsequent ticket is smaller than the first ticket go ahead
				892	* and send us back for another loop through the enospc flushing
				893	* code.
				894	*/
				895	if (first_ticket_bytes == 0)
				896	first_ticket_bytes = ticket->bytes;
				897	else if (first_ticket_bytes > ticket->bytes)
				898	return true;
				899
Josef Bacik	84fe47a	2019-08-22 15:19:04 -0400	[diff] [blame]	900	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
				901	btrfs_info(fs_info, "failing ticket with %llu bytes",
				902	ticket->bytes);
				903
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	904	list_del_init(&ticket->list);
				905	ticket->error = -ENOSPC;
				906	wake_up(&ticket->wait);
Josef Bacik	2341ccd	2019-08-28 11:12:47 -0400	[diff] [blame]	907
				908	/*
				909	* We're just throwing tickets away, so more flushing may not
				910	* trip over btrfs_try_granting_tickets, so we need to call it
				911	* here to see if we can make progress with the next ticket in
				912	* the list.
				913	*/
				914	btrfs_try_granting_tickets(fs_info, space_info);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	915	}
Josef Bacik	2341ccd	2019-08-28 11:12:47 -0400	[diff] [blame]	916	return (tickets_id != space_info->tickets_id);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	917	}
				918
				919	/*
				920	* This is for normal flushers, we can wait all goddamned day if we want to. We
				921	* will loop and continuously try to flush as long as we are making progress.
				922	* We count progress as clearing off tickets each time we have to loop.
				923	*/
				924	static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
				925	{
				926	struct btrfs_fs_info *fs_info;
				927	struct btrfs_space_info *space_info;
				928	u64 to_reclaim;
				929	int flush_state;
				930	int commit_cycles = 0;
				931	u64 last_tickets_id;
				932
				933	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
				934	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				935
				936	spin_lock(&space_info->lock);
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	937	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	938	if (!to_reclaim) {
				939	space_info->flush = 0;
				940	spin_unlock(&space_info->lock);
				941	return;
				942	}
				943	last_tickets_id = space_info->tickets_id;
				944	spin_unlock(&space_info->lock);
				945
				946	flush_state = FLUSH_DELAYED_ITEMS_NR;
				947	do {
				948	flush_space(fs_info, space_info, to_reclaim, flush_state);
				949	spin_lock(&space_info->lock);
				950	if (list_empty(&space_info->tickets)) {
				951	space_info->flush = 0;
				952	spin_unlock(&space_info->lock);
				953	return;
				954	}
				955	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	956	space_info);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	957	if (last_tickets_id == space_info->tickets_id) {
				958	flush_state++;
				959	} else {
				960	last_tickets_id = space_info->tickets_id;
				961	flush_state = FLUSH_DELAYED_ITEMS_NR;
				962	if (commit_cycles)
				963	commit_cycles--;
				964	}
				965
				966	/*
				967	* We don't want to force a chunk allocation until we've tried
				968	* pretty hard to reclaim space. Think of the case where we
				969	* freed up a bunch of space and so have a lot of pinned space
				970	* to reclaim. We would rather use that than possibly create a
				971	* underutilized metadata chunk. So if this is our first run
				972	* through the flushing state machine skip ALLOC_CHUNK_FORCE and
				973	* commit the transaction. If nothing has changed the next go
				974	* around then we can force a chunk allocation.
				975	*/
				976	if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
				977	flush_state++;
				978
				979	if (flush_state > COMMIT_TRANS) {
				980	commit_cycles++;
				981	if (commit_cycles > 2) {
Josef Bacik	2341ccd	2019-08-28 11:12:47 -0400	[diff] [blame]	982	if (maybe_fail_all_tickets(fs_info, space_info)) {
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	983	flush_state = FLUSH_DELAYED_ITEMS_NR;
				984	commit_cycles--;
				985	} else {
				986	space_info->flush = 0;
				987	}
				988	} else {
				989	flush_state = FLUSH_DELAYED_ITEMS_NR;
				990	}
				991	}
				992	spin_unlock(&space_info->lock);
				993	} while (flush_state <= COMMIT_TRANS);
				994	}
				995
				996	void btrfs_init_async_reclaim_work(struct work_struct *work)
				997	{
				998	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
				999	}
				1000
				1001	static const enum btrfs_flush_state priority_flush_states[] = {
				1002	FLUSH_DELAYED_ITEMS_NR,
				1003	FLUSH_DELAYED_ITEMS,
				1004	ALLOC_CHUNK,
				1005	};
				1006
Josef Bacik	d3984c9	2019-08-01 18:19:37 -0400	[diff] [blame]	1007	static const enum btrfs_flush_state evict_flush_states[] = {
				1008	FLUSH_DELAYED_ITEMS_NR,
				1009	FLUSH_DELAYED_ITEMS,
				1010	FLUSH_DELAYED_REFS_NR,
				1011	FLUSH_DELAYED_REFS,
				1012	FLUSH_DELALLOC,
				1013	FLUSH_DELALLOC_WAIT,
				1014	ALLOC_CHUNK,
				1015	COMMIT_TRANS,
				1016	};
				1017
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1018	static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
Josef Bacik	9ce2f42	2019-08-01 18:19:36 -0400	[diff] [blame]	1019	struct btrfs_space_info *space_info,
				1020	struct reserve_ticket *ticket,
				1021	const enum btrfs_flush_state *states,
				1022	int states_nr)
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1023	{
				1024	u64 to_reclaim;
				1025	int flush_state;
				1026
				1027	spin_lock(&space_info->lock);
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	1028	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1029	if (!to_reclaim) {
				1030	spin_unlock(&space_info->lock);
				1031	return;
				1032	}
				1033	spin_unlock(&space_info->lock);
				1034
				1035	flush_state = 0;
				1036	do {
Josef Bacik	9ce2f42	2019-08-01 18:19:36 -0400	[diff] [blame]	1037	flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1038	flush_state++;
				1039	spin_lock(&space_info->lock);
				1040	if (ticket->bytes == 0) {
				1041	spin_unlock(&space_info->lock);
				1042	return;
				1043	}
				1044	spin_unlock(&space_info->lock);
Josef Bacik	9ce2f42	2019-08-01 18:19:36 -0400	[diff] [blame]	1045	} while (flush_state < states_nr);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1046	}
				1047
Josef Bacik	374bf9c	2019-08-01 18:19:34 -0400	[diff] [blame]	1048	static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				1049	struct btrfs_space_info *space_info,
				1050	struct reserve_ticket *ticket)
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1051
				1052	{
				1053	DEFINE_WAIT(wait);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1054	int ret = 0;
				1055
				1056	spin_lock(&space_info->lock);
				1057	while (ticket->bytes > 0 && ticket->error == 0) {
				1058	ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
				1059	if (ret) {
Filipe Manana	0cab7ac	2019-10-25 10:53:41 +0100	[diff] [blame]	1060	/*
				1061	* Delete us from the list. After we unlock the space
				1062	* info, we don't want the async reclaim job to reserve
				1063	* space for this ticket. If that would happen, then the
				1064	* ticket's task would not known that space was reserved
				1065	* despite getting an error, resulting in a space leak
				1066	* (bytes_may_use counter of our space_info).
				1067	*/
				1068	list_del_init(&ticket->list);
Josef Bacik	374bf9c	2019-08-01 18:19:34 -0400	[diff] [blame]	1069	ticket->error = -EINTR;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1070	break;
				1071	}
				1072	spin_unlock(&space_info->lock);
				1073
				1074	schedule();
				1075
				1076	finish_wait(&ticket->wait, &wait);
				1077	spin_lock(&space_info->lock);
				1078	}
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1079	spin_unlock(&space_info->lock);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1080	}
				1081
				1082	/**
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1083	* handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
				1084	* @fs_info - the fs
				1085	* @space_info - the space_info for the reservation
				1086	* @ticket - the ticket for the reservation
				1087	* @flush - how much we can flush
				1088	*
				1089	* This does the work of figuring out how to flush for the ticket, waiting for
				1090	* the reservation, and returning the appropriate error if there is one.
				1091	*/
				1092	static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				1093	struct btrfs_space_info *space_info,
				1094	struct reserve_ticket *ticket,
				1095	enum btrfs_reserve_flush_enum flush)
				1096	{
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1097	int ret;
				1098
Josef Bacik	d3984c9	2019-08-01 18:19:37 -0400	[diff] [blame]	1099	switch (flush) {
				1100	case BTRFS_RESERVE_FLUSH_ALL:
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1101	wait_reserve_ticket(fs_info, space_info, ticket);
Josef Bacik	d3984c9	2019-08-01 18:19:37 -0400	[diff] [blame]	1102	break;
				1103	case BTRFS_RESERVE_FLUSH_LIMIT:
Josef Bacik	9ce2f42	2019-08-01 18:19:36 -0400	[diff] [blame]	1104	priority_reclaim_metadata_space(fs_info, space_info, ticket,
				1105	priority_flush_states,
				1106	ARRAY_SIZE(priority_flush_states));
Josef Bacik	d3984c9	2019-08-01 18:19:37 -0400	[diff] [blame]	1107	break;
				1108	case BTRFS_RESERVE_FLUSH_EVICT:
				1109	priority_reclaim_metadata_space(fs_info, space_info, ticket,
				1110	evict_flush_states,
				1111	ARRAY_SIZE(evict_flush_states));
				1112	break;
				1113	default:
				1114	ASSERT(0);
				1115	break;
				1116	}
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1117
				1118	spin_lock(&space_info->lock);
				1119	ret = ticket->error;
				1120	if (ticket->bytes \|\| ticket->error) {
Filipe Manana	0cab7ac	2019-10-25 10:53:41 +0100	[diff] [blame]	1121	/*
				1122	* Need to delete here for priority tickets. For regular tickets
				1123	* either the async reclaim job deletes the ticket from the list
				1124	* or we delete it ourselves at wait_reserve_ticket().
				1125	*/
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1126	list_del_init(&ticket->list);
				1127	if (!ret)
				1128	ret = -ENOSPC;
				1129	}
				1130	spin_unlock(&space_info->lock);
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1131	ASSERT(list_empty(&ticket->list));
Filipe Manana	0cab7ac	2019-10-25 10:53:41 +0100	[diff] [blame]	1132	/*
				1133	* Check that we can't have an error set if the reservation succeeded,
				1134	* as that would confuse tasks and lead them to error out without
				1135	* releasing reserved space (if an error happens the expectation is that
				1136	* space wasn't reserved at all).
				1137	*/
				1138	ASSERT(!(ticket->bytes == 0 && ticket->error));
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1139	return ret;
				1140	}
				1141
				1142	/**
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1143	* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
				1144	* @root - the root we're allocating for
				1145	* @space_info - the space info we want to allocate from
				1146	* @orig_bytes - the number of bytes we want
				1147	* @flush - whether or not we can flush to make our reservation
				1148	*
				1149	* This will reserve orig_bytes number of bytes from the space info associated
				1150	* with the block_rsv. If there is not enough space it will make an attempt to
				1151	* flush out space to make room. It will do this by flushing delalloc if
				1152	* possible or committing the transaction. If flush is 0 then no attempts to
				1153	* regain reservations will be made and this will fail if there is not enough
				1154	* space already.
				1155	*/
				1156	static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				1157	struct btrfs_space_info *space_info,
				1158	u64 orig_bytes,
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	1159	enum btrfs_reserve_flush_enum flush)
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1160	{
				1161	struct reserve_ticket ticket;
				1162	u64 used;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1163	int ret = 0;
Josef Bacik	ef1317a	2019-08-22 15:10:54 -0400	[diff] [blame]	1164	bool pending_tickets;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1165
				1166	ASSERT(orig_bytes);
				1167	ASSERT(!current->journal_info \|\| flush != BTRFS_RESERVE_FLUSH_ALL);
				1168
				1169	spin_lock(&space_info->lock);
				1170	ret = -ENOSPC;
				1171	used = btrfs_space_info_used(space_info, true);
Josef Bacik	ef1317a	2019-08-22 15:10:54 -0400	[diff] [blame]	1172	pending_tickets = !list_empty(&space_info->tickets) \|\|
				1173	!list_empty(&space_info->priority_tickets);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1174
				1175	/*
Goldwyn Rodrigues	9b4851b	2019-06-25 20:11:31 +0200	[diff] [blame]	1176	* Carry on if we have enough space (short-circuit) OR call
				1177	* can_overcommit() to ensure we can overcommit to continue.
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1178	*/
Josef Bacik	ef1317a	2019-08-22 15:10:54 -0400	[diff] [blame]	1179	if (!pending_tickets &&
				1180	((used + orig_bytes <= space_info->total_bytes) \|\|
Josef Bacik	a30a3d2	2020-01-17 09:07:39 -0500	[diff] [blame]	1181	btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1182	btrfs_space_info_update_bytes_may_use(fs_info, space_info,
				1183	orig_bytes);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1184	ret = 0;
				1185	}
				1186
				1187	/*
				1188	* If we couldn't make a reservation then setup our reservation ticket
				1189	* and kick the async worker if it's not already running.
				1190	*
				1191	* If we are a priority flusher then we just need to add our ticket to
				1192	* the list and we will do our own flushing further down.
				1193	*/
				1194	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1195	ticket.bytes = orig_bytes;
				1196	ticket.error = 0;
				1197	init_waitqueue_head(&ticket.wait);
				1198	if (flush == BTRFS_RESERVE_FLUSH_ALL) {
				1199	list_add_tail(&ticket.list, &space_info->tickets);
				1200	if (!space_info->flush) {
				1201	space_info->flush = 1;
				1202	trace_btrfs_trigger_flush(fs_info,
				1203	space_info->flags,
				1204	orig_bytes, flush,
				1205	"enospc");
				1206	queue_work(system_unbound_wq,
				1207	&fs_info->async_reclaim_work);
				1208	}
				1209	} else {
				1210	list_add_tail(&ticket.list,
				1211	&space_info->priority_tickets);
				1212	}
				1213	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
				1214	used += orig_bytes;
				1215	/*
				1216	* We will do the space reservation dance during log replay,
				1217	* which means we won't have fs_info->fs_root set, so don't do
				1218	* the async reclaim as we will panic.
				1219	*/
				1220	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	1221	need_do_async_reclaim(fs_info, space_info, used) &&
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1222	!work_busy(&fs_info->async_reclaim_work)) {
				1223	trace_btrfs_trigger_flush(fs_info, space_info->flags,
				1224	orig_bytes, flush, "preempt");
				1225	queue_work(system_unbound_wq,
				1226	&fs_info->async_reclaim_work);
				1227	}
				1228	}
				1229	spin_unlock(&space_info->lock);
				1230	if (!ret \|\| flush == BTRFS_RESERVE_NO_FLUSH)
				1231	return ret;
				1232
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1233	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1234	}
				1235
				1236	/**
				1237	* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
				1238	* @root - the root we're allocating for
				1239	* @block_rsv - the block_rsv we're allocating for
				1240	* @orig_bytes - the number of bytes we want
				1241	* @flush - whether or not we can flush to make our reservation
				1242	*
				1243	* This will reserve orig_bytes number of bytes from the space info associated
				1244	* with the block_rsv. If there is not enough space it will make an attempt to
				1245	* flush out space to make room. It will do this by flushing delalloc if
				1246	* possible or committing the transaction. If flush is 0 then no attempts to
				1247	* regain reservations will be made and this will fail if there is not enough
				1248	* space already.
				1249	*/
				1250	int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				1251	struct btrfs_block_rsv *block_rsv,
				1252	u64 orig_bytes,
				1253	enum btrfs_reserve_flush_enum flush)
				1254	{
				1255	struct btrfs_fs_info *fs_info = root->fs_info;
				1256	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				1257	int ret;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1258
				1259	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
Josef Bacik	9f24692	2019-11-26 11:25:53 -0500	[diff] [blame]	1260	orig_bytes, flush);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1261	if (ret == -ENOSPC &&
				1262	unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
				1263	if (block_rsv != global_rsv &&
				1264	!btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
				1265	ret = 0;
				1266	}
				1267	if (ret == -ENOSPC) {
				1268	trace_btrfs_space_reservation(fs_info, "space_info:enospc",
				1269	block_rsv->space_info->flags,
				1270	orig_bytes, 1);
				1271
				1272	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
				1273	btrfs_dump_space_info(fs_info, block_rsv->space_info,
				1274	orig_bytes, 0);
				1275	}
				1276	return ret;
				1277	}