Blame - fs/btrfs/space-info.c - SHIFTPHONES/kernel/common

blob: 13a4326c8821d97e655b34e00a3daf25c502fb07 [file] [log] [blame]

Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
				3	#include "ctree.h"
				4	#include "space-info.h"
				5	#include "sysfs.h"
				6	#include "volumes.h"
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	7	#include "free-space-cache.h"
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	8	#include "ordered-data.h"
				9	#include "transaction.h"
				10	#include "math.h"
Josef Bacik	aac0023	2019-06-20 15:37:44 -0400	[diff] [blame]	11	#include "block-group.h"
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	12
				13	u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
				14	bool may_use_included)
				15	{
				16	ASSERT(s_info);
				17	return s_info->bytes_used + s_info->bytes_reserved +
				18	s_info->bytes_pinned + s_info->bytes_readonly +
				19	(may_use_included ? s_info->bytes_may_use : 0);
				20	}
				21
				22	/*
				23	* after adding space to the filesystem, we need to clear the full flags
				24	* on all the space infos.
				25	*/
				26	void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
				27	{
				28	struct list_head *head = &info->space_info;
				29	struct btrfs_space_info *found;
				30
				31	rcu_read_lock();
				32	list_for_each_entry_rcu(found, head, list)
				33	found->full = 0;
				34	rcu_read_unlock();
				35	}
				36
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	37	static int create_space_info(struct btrfs_fs_info *info, u64 flags)
				38	{
				39
				40	struct btrfs_space_info *space_info;
				41	int i;
				42	int ret;
				43
				44	space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
				45	if (!space_info)
				46	return -ENOMEM;
				47
				48	ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
				49	GFP_KERNEL);
				50	if (ret) {
				51	kfree(space_info);
				52	return ret;
				53	}
				54
				55	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
				56	INIT_LIST_HEAD(&space_info->block_groups[i]);
				57	init_rwsem(&space_info->groups_sem);
				58	spin_lock_init(&space_info->lock);
				59	space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
				60	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
				61	init_waitqueue_head(&space_info->wait);
				62	INIT_LIST_HEAD(&space_info->ro_bgs);
				63	INIT_LIST_HEAD(&space_info->tickets);
				64	INIT_LIST_HEAD(&space_info->priority_tickets);
				65
David Sterba	b882327	2019-08-01 18:50:16 +0200	[diff] [blame]	66	ret = btrfs_sysfs_add_space_info_type(info, space_info);
				67	if (ret)
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	68	return ret;
Josef Bacik	280c2908	2019-06-18 16:09:19 -0400	[diff] [blame]	69
				70	list_add_rcu(&space_info->list, &info->space_info);
				71	if (flags & BTRFS_BLOCK_GROUP_DATA)
				72	info->data_sinfo = space_info;
				73
				74	return ret;
				75	}
				76
				77	int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
				78	{
				79	struct btrfs_super_block *disk_super;
				80	u64 features;
				81	u64 flags;
				82	int mixed = 0;
				83	int ret;
				84
				85	disk_super = fs_info->super_copy;
				86	if (!btrfs_super_root(disk_super))
				87	return -EINVAL;
				88
				89	features = btrfs_super_incompat_flags(disk_super);
				90	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
				91	mixed = 1;
				92
				93	flags = BTRFS_BLOCK_GROUP_SYSTEM;
				94	ret = create_space_info(fs_info, flags);
				95	if (ret)
				96	goto out;
				97
				98	if (mixed) {
				99	flags = BTRFS_BLOCK_GROUP_METADATA \| BTRFS_BLOCK_GROUP_DATA;
				100	ret = create_space_info(fs_info, flags);
				101	} else {
				102	flags = BTRFS_BLOCK_GROUP_METADATA;
				103	ret = create_space_info(fs_info, flags);
				104	if (ret)
				105	goto out;
				106
				107	flags = BTRFS_BLOCK_GROUP_DATA;
				108	ret = create_space_info(fs_info, flags);
				109	}
				110	out:
				111	return ret;
				112	}
				113
				114	void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
				115	u64 total_bytes, u64 bytes_used,
				116	u64 bytes_readonly,
				117	struct btrfs_space_info **space_info)
				118	{
				119	struct btrfs_space_info *found;
				120	int factor;
				121
				122	factor = btrfs_bg_type_to_factor(flags);
				123
				124	found = btrfs_find_space_info(info, flags);
				125	ASSERT(found);
				126	spin_lock(&found->lock);
				127	found->total_bytes += total_bytes;
				128	found->disk_total += total_bytes * factor;
				129	found->bytes_used += bytes_used;
				130	found->disk_used += bytes_used * factor;
				131	found->bytes_readonly += bytes_readonly;
				132	if (total_bytes > 0)
				133	found->full = 0;
				134	btrfs_space_info_add_new_bytes(info, found,
				135	total_bytes - bytes_used -
				136	bytes_readonly);
				137	spin_unlock(&found->lock);
				138	*space_info = found;
				139	}
				140
				141	struct btrfs_space_info btrfs_find_space_info(struct btrfs_fs_info info,
				142	u64 flags)
				143	{
				144	struct list_head *head = &info->space_info;
				145	struct btrfs_space_info *found;
				146
				147	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
				148
				149	rcu_read_lock();
				150	list_for_each_entry_rcu(found, head, list) {
				151	if (found->flags & flags) {
				152	rcu_read_unlock();
				153	return found;
				154	}
				155	}
				156	rcu_read_unlock();
				157	return NULL;
				158	}
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	159
				160	static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
				161	{
				162	return (global->size << 1);
				163	}
				164
Josef Bacik	83d731a	2019-06-18 16:09:26 -0400	[diff] [blame]	165	static int can_overcommit(struct btrfs_fs_info *fs_info,
				166	struct btrfs_space_info *space_info, u64 bytes,
				167	enum btrfs_reserve_flush_enum flush,
				168	bool system_chunk)
Josef Bacik	41783ef	2019-06-18 16:09:20 -0400	[diff] [blame]	169	{
				170	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				171	u64 profile;
				172	u64 space_size;
				173	u64 avail;
				174	u64 used;
				175	int factor;
				176
				177	/* Don't overcommit when in mixed mode. */
				178	if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
				179	return 0;
				180
				181	if (system_chunk)
				182	profile = btrfs_system_alloc_profile(fs_info);
				183	else
				184	profile = btrfs_metadata_alloc_profile(fs_info);
				185
				186	used = btrfs_space_info_used(space_info, false);
				187
				188	/*
				189	* We only want to allow over committing if we have lots of actual space
				190	* free, but if we don't have enough space to handle the global reserve
				191	* space then we could end up having a real enospc problem when trying
				192	* to allocate a chunk or some other such important allocation.
				193	*/
				194	spin_lock(&global_rsv->lock);
				195	space_size = calc_global_rsv_need_space(global_rsv);
				196	spin_unlock(&global_rsv->lock);
				197	if (used + space_size >= space_info->total_bytes)
				198	return 0;
				199
				200	used += space_info->bytes_may_use;
				201
				202	avail = atomic64_read(&fs_info->free_chunk_space);
				203
				204	/*
				205	* If we have dup, raid1 or raid10 then only half of the free
				206	* space is actually usable. For raid56, the space info used
				207	* doesn't include the parity drive, so we don't have to
				208	* change the math
				209	*/
				210	factor = btrfs_bg_type_to_factor(profile);
				211	avail = div_u64(avail, factor);
				212
				213	/*
				214	* If we aren't flushing all things, let us overcommit up to
				215	* 1/2th of the space. If we can flush, don't let us overcommit
				216	* too much, let it overcommit up to 1/8 of the space.
				217	*/
				218	if (flush == BTRFS_RESERVE_FLUSH_ALL)
				219	avail >>= 3;
				220	else
				221	avail >>= 1;
				222
				223	if (used + bytes < space_info->total_bytes + avail)
				224	return 1;
				225	return 0;
				226	}
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	227
				228	/*
				229	* This is for space we already have accounted in space_info->bytes_may_use, so
				230	* basically when we're returning space from block_rsv's.
				231	*/
				232	void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
				233	struct btrfs_space_info *space_info,
				234	u64 num_bytes)
				235	{
				236	struct reserve_ticket *ticket;
				237	struct list_head *head;
				238	u64 used;
				239	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
				240	bool check_overcommit = false;
				241
				242	spin_lock(&space_info->lock);
				243	head = &space_info->priority_tickets;
				244
				245	/*
				246	* If we are over our limit then we need to check and see if we can
				247	* overcommit, and if we can't then we just need to free up our space
				248	* and not satisfy any requests.
				249	*/
				250	used = btrfs_space_info_used(space_info, true);
				251	if (used - num_bytes >= space_info->total_bytes)
				252	check_overcommit = true;
				253	again:
				254	while (!list_empty(head) && num_bytes) {
				255	ticket = list_first_entry(head, struct reserve_ticket,
				256	list);
				257	/*
				258	* We use 0 bytes because this space is already reserved, so
				259	* adding the ticket space would be a double count.
				260	*/
				261	if (check_overcommit &&
Josef Bacik	83d731a	2019-06-18 16:09:26 -0400	[diff] [blame]	262	!can_overcommit(fs_info, space_info, 0, flush, false))
Josef Bacik	b338b01	2019-06-18 16:09:22 -0400	[diff] [blame]	263	break;
				264	if (num_bytes >= ticket->bytes) {
				265	list_del_init(&ticket->list);
				266	num_bytes -= ticket->bytes;
				267	ticket->bytes = 0;
				268	space_info->tickets_id++;
				269	wake_up(&ticket->wait);
				270	} else {
				271	ticket->bytes -= num_bytes;
				272	num_bytes = 0;
				273	}
				274	}
				275
				276	if (num_bytes && head == &space_info->priority_tickets) {
				277	head = &space_info->tickets;
				278	flush = BTRFS_RESERVE_FLUSH_ALL;
				279	goto again;
				280	}
				281	btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
				282	trace_btrfs_space_reservation(fs_info, "space_info",
				283	space_info->flags, num_bytes, 0);
				284	spin_unlock(&space_info->lock);
				285	}
				286
				287	/*
				288	* This is for newly allocated space that isn't accounted in
				289	* space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
				290	* we use this helper.
				291	*/
				292	void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
				293	struct btrfs_space_info *space_info,
				294	u64 num_bytes)
				295	{
				296	struct reserve_ticket *ticket;
				297	struct list_head *head = &space_info->priority_tickets;
				298
				299	again:
				300	while (!list_empty(head) && num_bytes) {
				301	ticket = list_first_entry(head, struct reserve_ticket,
				302	list);
				303	if (num_bytes >= ticket->bytes) {
				304	trace_btrfs_space_reservation(fs_info, "space_info",
				305	space_info->flags,
				306	ticket->bytes, 1);
				307	list_del_init(&ticket->list);
				308	num_bytes -= ticket->bytes;
				309	btrfs_space_info_update_bytes_may_use(fs_info,
				310	space_info,
				311	ticket->bytes);
				312	ticket->bytes = 0;
				313	space_info->tickets_id++;
				314	wake_up(&ticket->wait);
				315	} else {
				316	trace_btrfs_space_reservation(fs_info, "space_info",
				317	space_info->flags,
				318	num_bytes, 1);
				319	btrfs_space_info_update_bytes_may_use(fs_info,
				320	space_info,
				321	num_bytes);
				322	ticket->bytes -= num_bytes;
				323	num_bytes = 0;
				324	}
				325	}
				326
				327	if (num_bytes && head == &space_info->priority_tickets) {
				328	head = &space_info->tickets;
				329	goto again;
				330	}
				331	}
Josef Bacik	5da6afe	2019-06-18 16:09:24 -0400	[diff] [blame]	332
				/*
				 * Log size and reserved of the block reserve (fs_info)->rsv_name, read
				 * under that reserve's lock. The message is prefixed with the member name.
				 */
				#define DUMP_BLOCK_RSV(fs_info, rsv_name)				\
				do {									\
					struct btrfs_block_rsv *__dump_rsv = &(fs_info)->rsv_name;	\
											\
					spin_lock(&__dump_rsv->lock);					\
					btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",	\
						   __dump_rsv->size, __dump_rsv->reserved);		\
					spin_unlock(&__dump_rsv->lock);					\
				} while (0)
				341
				342	void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
				343	struct btrfs_space_info *info, u64 bytes,
				344	int dump_block_groups)
				345	{
				346	struct btrfs_block_group_cache *cache;
				347	int index = 0;
				348
				349	spin_lock(&info->lock);
				350	btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
				351	info->flags,
				352	info->total_bytes - btrfs_space_info_used(info, true),
				353	info->full ? "" : "not ");
				354	btrfs_info(fs_info,
				355	"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
				356	info->total_bytes, info->bytes_used, info->bytes_pinned,
				357	info->bytes_reserved, info->bytes_may_use,
				358	info->bytes_readonly);
				359	spin_unlock(&info->lock);
				360
				361	DUMP_BLOCK_RSV(fs_info, global_block_rsv);
				362	DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
				363	DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
				364	DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
				365	DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
				366
				367	if (!dump_block_groups)
				368	return;
				369
				370	down_read(&info->groups_sem);
				371	again:
				372	list_for_each_entry(cache, &info->block_groups[index], list) {
				373	spin_lock(&cache->lock);
				374	btrfs_info(fs_info,
				375	"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
				376	cache->key.objectid, cache->key.offset,
				377	btrfs_block_group_used(&cache->item), cache->pinned,
				378	cache->reserved, cache->ro ? "[readonly]" : "");
				379	btrfs_dump_free_space(cache, bytes);
				380	spin_unlock(&cache->lock);
				381	}
				382	if (++index < BTRFS_NR_RAID_TYPES)
				383	goto again;
				384	up_read(&info->groups_sem);
				385	}
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	386
				387	static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
				388	unsigned long nr_pages, int nr_items)
				389	{
				390	struct super_block *sb = fs_info->sb;
				391
				392	if (down_read_trylock(&sb->s_umount)) {
				393	writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
				394	up_read(&sb->s_umount);
				395	} else {
				396	/*
				397	* We needn't worry the filesystem going from r/w to r/o though
				398	* we don't acquire ->s_umount mutex, because the filesystem
				399	* should guarantee the delalloc inodes list be empty after
				400	* the filesystem is readonly(all dirty pages are written to
				401	* the disk).
				402	*/
				403	btrfs_start_delalloc_roots(fs_info, nr_items);
				404	if (!current->journal_info)
				405	btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
				406	}
				407	}
				408
				409	static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
				410	u64 to_reclaim)
				411	{
				412	u64 bytes;
				413	u64 nr;
				414
Josef Bacik	2bd36e7	2019-08-22 15:14:33 -0400	[diff] [blame^]	415	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	416	nr = div64_u64(to_reclaim, bytes);
				417	if (!nr)
				418	nr = 1;
				419	return nr;
				420	}
				421
				422	#define EXTENT_SIZE_PER_ITEM SZ_256K
				423
				424	/*
				425	* shrink metadata reservation for delalloc
				426	*/
				427	static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
				428	u64 orig, bool wait_ordered)
				429	{
				430	struct btrfs_space_info *space_info;
				431	struct btrfs_trans_handle *trans;
				432	u64 delalloc_bytes;
				433	u64 dio_bytes;
				434	u64 async_pages;
				435	u64 items;
				436	long time_left;
				437	unsigned long nr_pages;
				438	int loops;
				439
				440	/* Calc the number of the pages we need flush for space reservation */
				441	items = calc_reclaim_items_nr(fs_info, to_reclaim);
				442	to_reclaim = items * EXTENT_SIZE_PER_ITEM;
				443
				444	trans = (struct btrfs_trans_handle *)current->journal_info;
				445	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				446
				447	delalloc_bytes = percpu_counter_sum_positive(
				448	&fs_info->delalloc_bytes);
				449	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
				450	if (delalloc_bytes == 0 && dio_bytes == 0) {
				451	if (trans)
				452	return;
				453	if (wait_ordered)
				454	btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
				455	return;
				456	}
				457
				458	/*
				459	* If we are doing more ordered than delalloc we need to just wait on
				460	* ordered extents, otherwise we'll waste time trying to flush delalloc
				461	* that likely won't give us the space back we need.
				462	*/
				463	if (dio_bytes > delalloc_bytes)
				464	wait_ordered = true;
				465
				466	loops = 0;
				467	while ((delalloc_bytes \|\| dio_bytes) && loops < 3) {
				468	nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
				469
				470	/*
				471	* Triggers inode writeback for up to nr_pages. This will invoke
				472	* ->writepages callback and trigger delalloc filling
				473	* (btrfs_run_delalloc_range()).
				474	*/
				475	btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);
				476
				477	/*
				478	* We need to wait for the compressed pages to start before
				479	* we continue.
				480	*/
				481	async_pages = atomic_read(&fs_info->async_delalloc_pages);
				482	if (!async_pages)
				483	goto skip_async;
				484
				485	/*
				486	* Calculate how many compressed pages we want to be written
				487	* before we continue. I.e if there are more async pages than we
				488	* require wait_event will wait until nr_pages are written.
				489	*/
				490	if (async_pages <= nr_pages)
				491	async_pages = 0;
				492	else
				493	async_pages -= nr_pages;
				494
				495	wait_event(fs_info->async_submit_wait,
				496	atomic_read(&fs_info->async_delalloc_pages) <=
				497	(int)async_pages);
				498	skip_async:
				499	spin_lock(&space_info->lock);
				500	if (list_empty(&space_info->tickets) &&
				501	list_empty(&space_info->priority_tickets)) {
				502	spin_unlock(&space_info->lock);
				503	break;
				504	}
				505	spin_unlock(&space_info->lock);
				506
				507	loops++;
				508	if (wait_ordered && !trans) {
				509	btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
				510	} else {
				511	time_left = schedule_timeout_killable(1);
				512	if (time_left)
				513	break;
				514	}
				515	delalloc_bytes = percpu_counter_sum_positive(
				516	&fs_info->delalloc_bytes);
				517	dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
				518	}
				519	}
				520
				521	/**
				522	* maybe_commit_transaction - possibly commit the transaction if its ok to
				523	* @root - the root we're allocating for
				524	* @bytes - the number of bytes we want to reserve
				525	* @force - force the commit
				526	*
				527	* This will check to make sure that committing the transaction will actually
				528	* get us somewhere and then commit the transaction if it does. Otherwise it
				529	* will return -ENOSPC.
				530	*/
				531	static int may_commit_transaction(struct btrfs_fs_info *fs_info,
				532	struct btrfs_space_info *space_info)
				533	{
				534	struct reserve_ticket *ticket = NULL;
				535	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
				536	struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
				537	struct btrfs_trans_handle *trans;
				538	u64 bytes_needed;
				539	u64 reclaim_bytes = 0;
				540
				541	trans = (struct btrfs_trans_handle *)current->journal_info;
				542	if (trans)
				543	return -EAGAIN;
				544
				545	spin_lock(&space_info->lock);
				546	if (!list_empty(&space_info->priority_tickets))
				547	ticket = list_first_entry(&space_info->priority_tickets,
				548	struct reserve_ticket, list);
				549	else if (!list_empty(&space_info->tickets))
				550	ticket = list_first_entry(&space_info->tickets,
				551	struct reserve_ticket, list);
				552	bytes_needed = (ticket) ? ticket->bytes : 0;
				553	spin_unlock(&space_info->lock);
				554
				555	if (!bytes_needed)
				556	return 0;
				557
				558	trans = btrfs_join_transaction(fs_info->extent_root);
				559	if (IS_ERR(trans))
				560	return PTR_ERR(trans);
				561
				562	/*
				563	* See if there is enough pinned space to make this reservation, or if
				564	* we have block groups that are going to be freed, allowing us to
				565	* possibly do a chunk allocation the next loop through.
				566	*/
				567	if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) \|\|
				568	__percpu_counter_compare(&space_info->total_bytes_pinned,
				569	bytes_needed,
				570	BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
				571	goto commit;
				572
				573	/*
				574	* See if there is some space in the delayed insertion reservation for
				575	* this reservation.
				576	*/
				577	if (space_info != delayed_rsv->space_info)
				578	goto enospc;
				579
				580	spin_lock(&delayed_rsv->lock);
				581	reclaim_bytes += delayed_rsv->reserved;
				582	spin_unlock(&delayed_rsv->lock);
				583
				584	spin_lock(&delayed_refs_rsv->lock);
				585	reclaim_bytes += delayed_refs_rsv->reserved;
				586	spin_unlock(&delayed_refs_rsv->lock);
				587	if (reclaim_bytes >= bytes_needed)
				588	goto commit;
				589	bytes_needed -= reclaim_bytes;
				590
				591	if (__percpu_counter_compare(&space_info->total_bytes_pinned,
				592	bytes_needed,
				593	BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
				594	goto enospc;
				595
				596	commit:
				597	return btrfs_commit_transaction(trans);
				598	enospc:
				599	btrfs_end_transaction(trans);
				600	return -ENOSPC;
				601	}
				602
				603	/*
				604	* Try to flush some data based on policy set by @state. This is only advisory
				605	* and may fail for various reasons. The caller is supposed to examine the
				606	* state of @space_info to detect the outcome.
				607	*/
				608	static void flush_space(struct btrfs_fs_info *fs_info,
				609	struct btrfs_space_info *space_info, u64 num_bytes,
				610	int state)
				611	{
				612	struct btrfs_root *root = fs_info->extent_root;
				613	struct btrfs_trans_handle *trans;
				614	int nr;
				615	int ret = 0;
				616
				617	switch (state) {
				618	case FLUSH_DELAYED_ITEMS_NR:
				619	case FLUSH_DELAYED_ITEMS:
				620	if (state == FLUSH_DELAYED_ITEMS_NR)
				621	nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
				622	else
				623	nr = -1;
				624
				625	trans = btrfs_join_transaction(root);
				626	if (IS_ERR(trans)) {
				627	ret = PTR_ERR(trans);
				628	break;
				629	}
				630	ret = btrfs_run_delayed_items_nr(trans, nr);
				631	btrfs_end_transaction(trans);
				632	break;
				633	case FLUSH_DELALLOC:
				634	case FLUSH_DELALLOC_WAIT:
				635	shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
				636	state == FLUSH_DELALLOC_WAIT);
				637	break;
				638	case FLUSH_DELAYED_REFS_NR:
				639	case FLUSH_DELAYED_REFS:
				640	trans = btrfs_join_transaction(root);
				641	if (IS_ERR(trans)) {
				642	ret = PTR_ERR(trans);
				643	break;
				644	}
				645	if (state == FLUSH_DELAYED_REFS_NR)
				646	nr = calc_reclaim_items_nr(fs_info, num_bytes);
				647	else
				648	nr = 0;
				649	btrfs_run_delayed_refs(trans, nr);
				650	btrfs_end_transaction(trans);
				651	break;
				652	case ALLOC_CHUNK:
				653	case ALLOC_CHUNK_FORCE:
				654	trans = btrfs_join_transaction(root);
				655	if (IS_ERR(trans)) {
				656	ret = PTR_ERR(trans);
				657	break;
				658	}
				659	ret = btrfs_chunk_alloc(trans,
				660	btrfs_metadata_alloc_profile(fs_info),
				661	(state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
				662	CHUNK_ALLOC_FORCE);
				663	btrfs_end_transaction(trans);
				664	if (ret > 0 \|\| ret == -ENOSPC)
				665	ret = 0;
				666	break;
Josef Bacik	844245b	2019-08-01 18:19:33 -0400	[diff] [blame]	667	case RUN_DELAYED_IPUTS:
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	668	/*
				669	* If we have pending delayed iputs then we could free up a
				670	* bunch of pinned space, so make sure we run the iputs before
				671	* we do our pinned bytes check below.
				672	*/
				673	btrfs_run_delayed_iputs(fs_info);
				674	btrfs_wait_on_delayed_iputs(fs_info);
Josef Bacik	844245b	2019-08-01 18:19:33 -0400	[diff] [blame]	675	break;
				676	case COMMIT_TRANS:
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	677	ret = may_commit_transaction(fs_info, space_info);
				678	break;
				679	default:
				680	ret = -ENOSPC;
				681	break;
				682	}
				683
				684	trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
				685	ret);
				686	return;
				687	}
				688
				689	static inline u64
				690	btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
				691	struct btrfs_space_info *space_info,
				692	bool system_chunk)
				693	{
				694	struct reserve_ticket *ticket;
				695	u64 used;
				696	u64 expected;
				697	u64 to_reclaim = 0;
				698
				699	list_for_each_entry(ticket, &space_info->tickets, list)
				700	to_reclaim += ticket->bytes;
				701	list_for_each_entry(ticket, &space_info->priority_tickets, list)
				702	to_reclaim += ticket->bytes;
				703	if (to_reclaim)
				704	return to_reclaim;
				705
				706	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
Josef Bacik	83d731a	2019-06-18 16:09:26 -0400	[diff] [blame]	707	if (can_overcommit(fs_info, space_info, to_reclaim,
				708	BTRFS_RESERVE_FLUSH_ALL, system_chunk))
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	709	return 0;
				710
				711	used = btrfs_space_info_used(space_info, true);
				712
Josef Bacik	83d731a	2019-06-18 16:09:26 -0400	[diff] [blame]	713	if (can_overcommit(fs_info, space_info, SZ_1M,
				714	BTRFS_RESERVE_FLUSH_ALL, system_chunk))
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	715	expected = div_factor_fine(space_info->total_bytes, 95);
				716	else
				717	expected = div_factor_fine(space_info->total_bytes, 90);
				718
				719	if (used > expected)
				720	to_reclaim = used - expected;
				721	else
				722	to_reclaim = 0;
				723	to_reclaim = min(to_reclaim, space_info->bytes_may_use +
				724	space_info->bytes_reserved);
				725	return to_reclaim;
				726	}
				727
				728	static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
				729	struct btrfs_space_info *space_info,
				730	u64 used, bool system_chunk)
				731	{
				732	u64 thresh = div_factor_fine(space_info->total_bytes, 98);
				733
				734	/* If we're just plain full then async reclaim just slows us down. */
				735	if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
				736	return 0;
				737
				738	if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				739	system_chunk))
				740	return 0;
				741
				742	return (used >= thresh && !btrfs_fs_closing(fs_info) &&
				743	!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
				744	}
				745
				746	static bool wake_all_tickets(struct list_head *head)
				747	{
				748	struct reserve_ticket *ticket;
				749
				750	while (!list_empty(head)) {
				751	ticket = list_first_entry(head, struct reserve_ticket, list);
				752	list_del_init(&ticket->list);
				753	ticket->error = -ENOSPC;
				754	wake_up(&ticket->wait);
				755	if (ticket->bytes != ticket->orig_bytes)
				756	return true;
				757	}
				758	return false;
				759	}
				760
				761	/*
				762	* This is for normal flushers, we can wait all goddamned day if we want to. We
				763	* will loop and continuously try to flush as long as we are making progress.
				764	* We count progress as clearing off tickets each time we have to loop.
				765	*/
				766	static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
				767	{
				768	struct btrfs_fs_info *fs_info;
				769	struct btrfs_space_info *space_info;
				770	u64 to_reclaim;
				771	int flush_state;
				772	int commit_cycles = 0;
				773	u64 last_tickets_id;
				774
				775	fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
				776	space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
				777
				778	spin_lock(&space_info->lock);
				779	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				780	false);
				781	if (!to_reclaim) {
				782	space_info->flush = 0;
				783	spin_unlock(&space_info->lock);
				784	return;
				785	}
				786	last_tickets_id = space_info->tickets_id;
				787	spin_unlock(&space_info->lock);
				788
				789	flush_state = FLUSH_DELAYED_ITEMS_NR;
				790	do {
				791	flush_space(fs_info, space_info, to_reclaim, flush_state);
				792	spin_lock(&space_info->lock);
				793	if (list_empty(&space_info->tickets)) {
				794	space_info->flush = 0;
				795	spin_unlock(&space_info->lock);
				796	return;
				797	}
				798	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
				799	space_info,
				800	false);
				801	if (last_tickets_id == space_info->tickets_id) {
				802	flush_state++;
				803	} else {
				804	last_tickets_id = space_info->tickets_id;
				805	flush_state = FLUSH_DELAYED_ITEMS_NR;
				806	if (commit_cycles)
				807	commit_cycles--;
				808	}
				809
				810	/*
				811	* We don't want to force a chunk allocation until we've tried
				812	* pretty hard to reclaim space. Think of the case where we
				813	* freed up a bunch of space and so have a lot of pinned space
				814	* to reclaim. We would rather use that than possibly create a
				815	* underutilized metadata chunk. So if this is our first run
				816	* through the flushing state machine skip ALLOC_CHUNK_FORCE and
				817	* commit the transaction. If nothing has changed the next go
				818	* around then we can force a chunk allocation.
				819	*/
				820	if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
				821	flush_state++;
				822
				823	if (flush_state > COMMIT_TRANS) {
				824	commit_cycles++;
				825	if (commit_cycles > 2) {
				826	if (wake_all_tickets(&space_info->tickets)) {
				827	flush_state = FLUSH_DELAYED_ITEMS_NR;
				828	commit_cycles--;
				829	} else {
				830	space_info->flush = 0;
				831	}
				832	} else {
				833	flush_state = FLUSH_DELAYED_ITEMS_NR;
				834	}
				835	}
				836	spin_unlock(&space_info->lock);
				837	} while (flush_state <= COMMIT_TRANS);
				838	}
				839
				840	void btrfs_init_async_reclaim_work(struct work_struct *work)
				841	{
				842	INIT_WORK(work, btrfs_async_reclaim_metadata_space);
				843	}
				844
				845	static const enum btrfs_flush_state priority_flush_states[] = {
				846	FLUSH_DELAYED_ITEMS_NR,
				847	FLUSH_DELAYED_ITEMS,
				848	ALLOC_CHUNK,
				849	};
				850
Josef Bacik	d3984c9	2019-08-01 18:19:37 -0400	[diff] [blame]	851	static const enum btrfs_flush_state evict_flush_states[] = {
				852	FLUSH_DELAYED_ITEMS_NR,
				853	FLUSH_DELAYED_ITEMS,
				854	FLUSH_DELAYED_REFS_NR,
				855	FLUSH_DELAYED_REFS,
				856	FLUSH_DELALLOC,
				857	FLUSH_DELALLOC_WAIT,
				858	ALLOC_CHUNK,
				859	COMMIT_TRANS,
				860	};
				861
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	862	static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
Josef Bacik	9ce2f42	2019-08-01 18:19:36 -0400	[diff] [blame]	863	struct btrfs_space_info *space_info,
				864	struct reserve_ticket *ticket,
				865	const enum btrfs_flush_state *states,
				866	int states_nr)
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	867	{
				868	u64 to_reclaim;
				869	int flush_state;
				870
				871	spin_lock(&space_info->lock);
				872	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
				873	false);
				874	if (!to_reclaim) {
				875	spin_unlock(&space_info->lock);
				876	return;
				877	}
				878	spin_unlock(&space_info->lock);
				879
				880	flush_state = 0;
				881	do {
Josef Bacik	9ce2f42	2019-08-01 18:19:36 -0400	[diff] [blame]	882	flush_space(fs_info, space_info, to_reclaim, states[flush_state]);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	883	flush_state++;
				884	spin_lock(&space_info->lock);
				885	if (ticket->bytes == 0) {
				886	spin_unlock(&space_info->lock);
				887	return;
				888	}
				889	spin_unlock(&space_info->lock);
Josef Bacik	9ce2f42	2019-08-01 18:19:36 -0400	[diff] [blame]	890	} while (flush_state < states_nr);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	891	}
				892
Josef Bacik	374bf9c	2019-08-01 18:19:34 -0400	[diff] [blame]	893	static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				894	struct btrfs_space_info *space_info,
				895	struct reserve_ticket *ticket)
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	896
				897	{
				898	DEFINE_WAIT(wait);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	899	int ret = 0;
				900
				901	spin_lock(&space_info->lock);
				902	while (ticket->bytes > 0 && ticket->error == 0) {
				903	ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
				904	if (ret) {
Josef Bacik	374bf9c	2019-08-01 18:19:34 -0400	[diff] [blame]	905	ticket->error = -EINTR;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	906	break;
				907	}
				908	spin_unlock(&space_info->lock);
				909
				910	schedule();
				911
				912	finish_wait(&ticket->wait, &wait);
				913	spin_lock(&space_info->lock);
				914	}
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	915	spin_unlock(&space_info->lock);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	916	}
				917
				918	/**
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	919	* handle_reserve_ticket - do the appropriate flushing and waiting for a ticket
				920	* @fs_info - the fs
				921	* @space_info - the space_info for the reservation
				922	* @ticket - the ticket for the reservation
				923	* @flush - how much we can flush
				924	*
				925	* This does the work of figuring out how to flush for the ticket, waiting for
				926	* the reservation, and returning the appropriate error if there is one.
				927	*/
				928	static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				929	struct btrfs_space_info *space_info,
				930	struct reserve_ticket *ticket,
				931	enum btrfs_reserve_flush_enum flush)
				932	{
				933	u64 reclaim_bytes = 0;
				934	int ret;
				935
Josef Bacik	d3984c9	2019-08-01 18:19:37 -0400	[diff] [blame]	936	switch (flush) {
				937	case BTRFS_RESERVE_FLUSH_ALL:
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	938	wait_reserve_ticket(fs_info, space_info, ticket);
Josef Bacik	d3984c9	2019-08-01 18:19:37 -0400	[diff] [blame]	939	break;
				940	case BTRFS_RESERVE_FLUSH_LIMIT:
Josef Bacik	9ce2f42	2019-08-01 18:19:36 -0400	[diff] [blame]	941	priority_reclaim_metadata_space(fs_info, space_info, ticket,
				942	priority_flush_states,
				943	ARRAY_SIZE(priority_flush_states));
Josef Bacik	d3984c9	2019-08-01 18:19:37 -0400	[diff] [blame]	944	break;
				945	case BTRFS_RESERVE_FLUSH_EVICT:
				946	priority_reclaim_metadata_space(fs_info, space_info, ticket,
				947	evict_flush_states,
				948	ARRAY_SIZE(evict_flush_states));
				949	break;
				950	default:
				951	ASSERT(0);
				952	break;
				953	}
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	954
				955	spin_lock(&space_info->lock);
				956	ret = ticket->error;
				957	if (ticket->bytes \|\| ticket->error) {
				958	if (ticket->bytes < ticket->orig_bytes)
				959	reclaim_bytes = ticket->orig_bytes - ticket->bytes;
				960	list_del_init(&ticket->list);
				961	if (!ret)
				962	ret = -ENOSPC;
				963	}
				964	spin_unlock(&space_info->lock);
				965
				966	if (reclaim_bytes)
				967	btrfs_space_info_add_old_bytes(fs_info, space_info,
				968	reclaim_bytes);
				969	ASSERT(list_empty(&ticket->list));
				970	return ret;
				971	}
				972
				973	/**
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	974	* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
				975	* @root - the root we're allocating for
				976	* @space_info - the space info we want to allocate from
				977	* @orig_bytes - the number of bytes we want
				978	* @flush - whether or not we can flush to make our reservation
				979	*
				980	* This will reserve orig_bytes number of bytes from the space info associated
				981	* with the block_rsv. If there is not enough space it will make an attempt to
				982	* flush out space to make room. It will do this by flushing delalloc if
				983	* possible or committing the transaction. If flush is 0 then no attempts to
				984	* regain reservations will be made and this will fail if there is not enough
				985	* space already.
				986	*/
				987	static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				988	struct btrfs_space_info *space_info,
				989	u64 orig_bytes,
				990	enum btrfs_reserve_flush_enum flush,
				991	bool system_chunk)
				992	{
				993	struct reserve_ticket ticket;
				994	u64 used;
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	995	int ret = 0;
				996
				997	ASSERT(orig_bytes);
				998	ASSERT(!current->journal_info \|\| flush != BTRFS_RESERVE_FLUSH_ALL);
				999
				1000	spin_lock(&space_info->lock);
				1001	ret = -ENOSPC;
				1002	used = btrfs_space_info_used(space_info, true);
				1003
				1004	/*
Goldwyn Rodrigues	9b4851b	2019-06-25 20:11:31 +0200	[diff] [blame]	1005	* Carry on if we have enough space (short-circuit) OR call
				1006	* can_overcommit() to ensure we can overcommit to continue.
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1007	*/
Goldwyn Rodrigues	9b4851b	2019-06-25 20:11:31 +0200	[diff] [blame]	1008	if ((used + orig_bytes <= space_info->total_bytes) \|\|
				1009	can_overcommit(fs_info, space_info, orig_bytes, flush,
				1010	system_chunk)) {
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1011	btrfs_space_info_update_bytes_may_use(fs_info, space_info,
				1012	orig_bytes);
				1013	trace_btrfs_space_reservation(fs_info, "space_info",
				1014	space_info->flags, orig_bytes, 1);
				1015	ret = 0;
				1016	}
				1017
				1018	/*
				1019	* If we couldn't make a reservation then setup our reservation ticket
				1020	* and kick the async worker if it's not already running.
				1021	*
				1022	* If we are a priority flusher then we just need to add our ticket to
				1023	* the list and we will do our own flushing further down.
				1024	*/
				1025	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
				1026	ticket.orig_bytes = orig_bytes;
				1027	ticket.bytes = orig_bytes;
				1028	ticket.error = 0;
				1029	init_waitqueue_head(&ticket.wait);
				1030	if (flush == BTRFS_RESERVE_FLUSH_ALL) {
				1031	list_add_tail(&ticket.list, &space_info->tickets);
				1032	if (!space_info->flush) {
				1033	space_info->flush = 1;
				1034	trace_btrfs_trigger_flush(fs_info,
				1035	space_info->flags,
				1036	orig_bytes, flush,
				1037	"enospc");
				1038	queue_work(system_unbound_wq,
				1039	&fs_info->async_reclaim_work);
				1040	}
				1041	} else {
				1042	list_add_tail(&ticket.list,
				1043	&space_info->priority_tickets);
				1044	}
				1045	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
				1046	used += orig_bytes;
				1047	/*
				1048	* We will do the space reservation dance during log replay,
				1049	* which means we won't have fs_info->fs_root set, so don't do
				1050	* the async reclaim as we will panic.
				1051	*/
				1052	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
				1053	need_do_async_reclaim(fs_info, space_info,
				1054	used, system_chunk) &&
				1055	!work_busy(&fs_info->async_reclaim_work)) {
				1056	trace_btrfs_trigger_flush(fs_info, space_info->flags,
				1057	orig_bytes, flush, "preempt");
				1058	queue_work(system_unbound_wq,
				1059	&fs_info->async_reclaim_work);
				1060	}
				1061	}
				1062	spin_unlock(&space_info->lock);
				1063	if (!ret \|\| flush == BTRFS_RESERVE_NO_FLUSH)
				1064	return ret;
				1065
Josef Bacik	0323527	2019-08-01 18:19:35 -0400	[diff] [blame]	1066	return handle_reserve_ticket(fs_info, space_info, &ticket, flush);
Josef Bacik	0d9764f	2019-06-18 16:09:25 -0400	[diff] [blame]	1067	}
				1068
				1069	/**
				1070	* reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
				1071	* @root - the root we're allocating for
				1072	* @block_rsv - the block_rsv we're allocating for
				1073	* @orig_bytes - the number of bytes we want
				1074	* @flush - whether or not we can flush to make our reservation
				1075	*
				1076	* This will reserve orig_bytes number of bytes from the space info associated
				1077	* with the block_rsv. If there is not enough space it will make an attempt to
				1078	* flush out space to make room. It will do this by flushing delalloc if
				1079	* possible or committing the transaction. If flush is 0 then no attempts to
				1080	* regain reservations will be made and this will fail if there is not enough
				1081	* space already.
				1082	*/
				1083	int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
				1084	struct btrfs_block_rsv *block_rsv,
				1085	u64 orig_bytes,
				1086	enum btrfs_reserve_flush_enum flush)
				1087	{
				1088	struct btrfs_fs_info *fs_info = root->fs_info;
				1089	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
				1090	int ret;
				1091	bool system_chunk = (root == fs_info->chunk_root);
				1092
				1093	ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
				1094	orig_bytes, flush, system_chunk);
				1095	if (ret == -ENOSPC &&
				1096	unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
				1097	if (block_rsv != global_rsv &&
				1098	!btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
				1099	ret = 0;
				1100	}
				1101	if (ret == -ENOSPC) {
				1102	trace_btrfs_space_reservation(fs_info, "space_info:enospc",
				1103	block_rsv->space_info->flags,
				1104	orig_bytes, 1);
				1105
				1106	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
				1107	btrfs_dump_space_info(fs_info, block_rsv->space_info,
				1108	orig_bytes, 0);
				1109	}
				1110	return ret;
				1111	}