// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "space-info.h"
#include "sysfs.h"
#include "volumes.h"
#include "free-space-cache.h"
#include "ordered-data.h"
#include "transaction.h"
#include "math.h"

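/*
 * Sum of the space currently consumed in @s_info: bytes_used, bytes_reserved,
 * bytes_pinned and bytes_readonly, plus bytes_may_use when @may_use_included
 * is true.
 */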
u64 btrfs_space_info_used(struct btrfs_space_info *s_info,
                          bool may_use_included)
{
        ASSERT(s_info);
        return s_info->bytes_used + s_info->bytes_reserved +
                s_info->bytes_pinned + s_info->bytes_readonly +
                (may_use_included ? s_info->bytes_may_use : 0);
}

/*
 * after adding space to the filesystem, we need to clear the full flags
 * on all the space infos.
 */
void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list)
                found->full = 0;
        rcu_read_unlock();
}

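/*
 * Name used for the sysfs directory of a space_info with the given block
 * group type flags.
 */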
static const char *alloc_name(u64 flags)
{
        switch (flags) {
        case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA:
                return "mixed";
        case BTRFS_BLOCK_GROUP_METADATA:
                return "metadata";
        case BTRFS_BLOCK_GROUP_DATA:
                return "data";
        case BTRFS_BLOCK_GROUP_SYSTEM:
                return "system";
        default:
                WARN_ON(1);
                return "invalid-combination";
        }
}

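/*
 * Allocate and initialize the space_info for the block group type in @flags,
 * register its sysfs kobject and add it to info->space_info. The data
 * space_info is additionally cached in info->data_sinfo.
 */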
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
        struct btrfs_space_info *space_info;
        int i;
        int ret;

        space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
        if (!space_info)
                return -ENOMEM;

        ret = percpu_counter_init(&space_info->total_bytes_pinned, 0,
                                  GFP_KERNEL);
        if (ret) {
                kfree(space_info);
                return ret;
        }

        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                INIT_LIST_HEAD(&space_info->block_groups[i]);
        init_rwsem(&space_info->groups_sem);
        spin_lock_init(&space_info->lock);
        space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
        space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
        init_waitqueue_head(&space_info->wait);
        INIT_LIST_HEAD(&space_info->ro_bgs);
        INIT_LIST_HEAD(&space_info->tickets);
        INIT_LIST_HEAD(&space_info->priority_tickets);

        ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype,
                                   info->space_info_kobj, "%s",
                                   alloc_name(space_info->flags));
        if (ret) {
                kobject_put(&space_info->kobj);
                return ret;
        }

        list_add_rcu(&space_info->list, &info->space_info);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                info->data_sinfo = space_info;

        return ret;
}

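/*
 * Create the space_infos needed at mount time: system, plus either a single
 * mixed metadata+data space_info or separate metadata and data ones,
 * depending on the MIXED_GROUPS incompat feature.
 */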
int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
{
        struct btrfs_super_block *disk_super;
        u64 features;
        u64 flags;
        int mixed = 0;
        int ret;

        disk_super = fs_info->super_copy;
        if (!btrfs_super_root(disk_super))
                return -EINVAL;

        features = btrfs_super_incompat_flags(disk_super);
        if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
                mixed = 1;

        flags = BTRFS_BLOCK_GROUP_SYSTEM;
        ret = create_space_info(fs_info, flags);
        if (ret)
                goto out;

        if (mixed) {
                flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        } else {
                flags = BTRFS_BLOCK_GROUP_METADATA;
                ret = create_space_info(fs_info, flags);
                if (ret)
                        goto out;

                flags = BTRFS_BLOCK_GROUP_DATA;
                ret = create_space_info(fs_info, flags);
        }
out:
        return ret;
}

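/*
 * Add @total_bytes, @bytes_used and @bytes_readonly to the space_info
 * matching @flags (disk totals are scaled by the raid factor), hand any newly
 * available space to waiting tickets via btrfs_space_info_add_new_bytes(),
 * and return the space_info through @space_info.
 */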
void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags,
                             u64 total_bytes, u64 bytes_used,
                             u64 bytes_readonly,
                             struct btrfs_space_info **space_info)
{
        struct btrfs_space_info *found;
        int factor;

        factor = btrfs_bg_type_to_factor(flags);

        found = btrfs_find_space_info(info, flags);
        ASSERT(found);
        spin_lock(&found->lock);
        found->total_bytes += total_bytes;
        found->disk_total += total_bytes * factor;
        found->bytes_used += bytes_used;
        found->disk_used += bytes_used * factor;
        found->bytes_readonly += bytes_readonly;
        if (total_bytes > 0)
                found->full = 0;
        btrfs_space_info_add_new_bytes(info, found,
                                       total_bytes - bytes_used -
                                       bytes_readonly);
        spin_unlock(&found->lock);
        *space_info = found;
}

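/*
 * Find the space_info whose type bits match @flags, or NULL if there is none.
 */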
struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
                                               u64 flags)
{
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;

        flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;

        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags & flags) {
                        rcu_read_unlock();
                        return found;
                }
        }
        rcu_read_unlock();
        return NULL;
}

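/*
 * Headroom kept for the global block reserve when deciding whether we can
 * overcommit: twice its current size.
 */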
static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
{
        return (global->size << 1);
}

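/*
 * Decide whether a reservation of @bytes may push the space_info past its
 * total_bytes. Overcommit is only allowed if enough unallocated device space
 * remains (scaled by the raid profile) for a later chunk allocation to cover
 * it; how much of that space we are willing to promise depends on @flush.
 */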
int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
                         struct btrfs_space_info *space_info, u64 bytes,
                         enum btrfs_reserve_flush_enum flush,
                         bool system_chunk)
{
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        u64 profile;
        u64 space_size;
        u64 avail;
        u64 used;
        int factor;

        /* Don't overcommit when in mixed mode. */
        if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
                return 0;

        if (system_chunk)
                profile = btrfs_system_alloc_profile(fs_info);
        else
                profile = btrfs_metadata_alloc_profile(fs_info);

        used = btrfs_space_info_used(space_info, false);

        /*
         * We only want to allow overcommitting if we have lots of actual space
         * free, but if we don't have enough space to handle the global reserve
         * space then we could end up having a real enospc problem when trying
         * to allocate a chunk or some other such important allocation.
         */
        spin_lock(&global_rsv->lock);
        space_size = calc_global_rsv_need_space(global_rsv);
        spin_unlock(&global_rsv->lock);
        if (used + space_size >= space_info->total_bytes)
                return 0;

        used += space_info->bytes_may_use;

        avail = atomic64_read(&fs_info->free_chunk_space);

        /*
         * If we have dup, raid1 or raid10 then only half of the free
         * space is actually usable. For raid56, the space info used
         * doesn't include the parity drive, so we don't have to
         * change the math.
         */
        factor = btrfs_bg_type_to_factor(profile);
        avail = div_u64(avail, factor);

        /*
         * If we aren't flushing all things, let us overcommit up to
         * half of the space. If we can flush, don't let us overcommit
         * too much, let it overcommit up to 1/8 of the space.
         */
        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                avail >>= 3;
        else
                avail >>= 1;

        if (used + bytes < space_info->total_bytes + avail)
                return 1;
        return 0;
}

/*
 * This is for space we already have accounted in space_info->bytes_may_use, so
 * basically when we're returning space from block_rsvs.
 */
void btrfs_space_info_add_old_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 num_bytes)
{
        struct reserve_ticket *ticket;
        struct list_head *head;
        u64 used;
        enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
        bool check_overcommit = false;

        spin_lock(&space_info->lock);
        head = &space_info->priority_tickets;

        /*
         * If we are over our limit then we need to check and see if we can
         * overcommit, and if we can't then we just need to free up our space
         * and not satisfy any requests.
         */
        used = btrfs_space_info_used(space_info, true);
        if (used - num_bytes >= space_info->total_bytes)
                check_overcommit = true;
again:
        while (!list_empty(head) && num_bytes) {
                ticket = list_first_entry(head, struct reserve_ticket,
                                          list);
                /*
                 * We use 0 bytes because this space is already reserved, so
                 * adding the ticket space would be a double count.
                 */
                if (check_overcommit &&
                    !btrfs_can_overcommit(fs_info, space_info, 0, flush,
                                          false))
                        break;
                if (num_bytes >= ticket->bytes) {
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
        }

        if (num_bytes && head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                flush = BTRFS_RESERVE_FLUSH_ALL;
                goto again;
        }
        btrfs_space_info_update_bytes_may_use(fs_info, space_info, -num_bytes);
        trace_btrfs_space_reservation(fs_info, "space_info",
                                      space_info->flags, num_bytes, 0);
        spin_unlock(&space_info->lock);
}

/*
 * This is for newly allocated space that isn't accounted in
 * space_info->bytes_may_use yet. So if we allocate a chunk or unpin an extent
 * we use this helper.
 */
void btrfs_space_info_add_new_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 num_bytes)
{
        struct reserve_ticket *ticket;
        struct list_head *head = &space_info->priority_tickets;

again:
        while (!list_empty(head) && num_bytes) {
                ticket = list_first_entry(head, struct reserve_ticket,
                                          list);
                if (num_bytes >= ticket->bytes) {
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      ticket->bytes, 1);
                        list_del_init(&ticket->list);
                        num_bytes -= ticket->bytes;
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              ticket->bytes);
                        ticket->bytes = 0;
                        space_info->tickets_id++;
                        wake_up(&ticket->wait);
                } else {
                        trace_btrfs_space_reservation(fs_info, "space_info",
                                                      space_info->flags,
                                                      num_bytes, 1);
                        btrfs_space_info_update_bytes_may_use(fs_info,
                                                              space_info,
                                                              num_bytes);
                        ticket->bytes -= num_bytes;
                        num_bytes = 0;
                }
        }

        if (num_bytes && head == &space_info->priority_tickets) {
                head = &space_info->tickets;
                goto again;
        }
}

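/* Log the current size and reserved bytes of the named block reserve. */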
#define DUMP_BLOCK_RSV(fs_info, rsv_name)                               \
do {                                                                    \
        struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name;          \
        spin_lock(&__rsv->lock);                                        \
        btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu",     \
                   __rsv->size, __rsv->reserved);                      \
        spin_unlock(&__rsv->lock);                                      \
} while (0)

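/*
 * Dump the counters of @info and, when @dump_block_groups is set, the usage
 * of every block group in it. Used for ENOSPC debugging.
 */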
void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
                           struct btrfs_space_info *info, u64 bytes,
                           int dump_block_groups)
{
        struct btrfs_block_group_cache *cache;
        int index = 0;

        spin_lock(&info->lock);
        btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
                   info->flags,
                   info->total_bytes - btrfs_space_info_used(info, true),
                   info->full ? "" : "not ");
        btrfs_info(fs_info,
                "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
                info->total_bytes, info->bytes_used, info->bytes_pinned,
                info->bytes_reserved, info->bytes_may_use,
                info->bytes_readonly);
        spin_unlock(&info->lock);

        DUMP_BLOCK_RSV(fs_info, global_block_rsv);
        DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
        DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
        DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);

        if (!dump_block_groups)
                return;

        down_read(&info->groups_sem);
again:
        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
                btrfs_info(fs_info,
                        "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s",
                        cache->key.objectid, cache->key.offset,
                        btrfs_block_group_used(&cache->item), cache->pinned,
                        cache->reserved, cache->ro ? "[readonly]" : "");
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
        if (++index < BTRFS_NR_RAID_TYPES)
                goto again;
        up_read(&info->groups_sem);
}

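/*
 * Kick writeback of dirty (delalloc) inodes so that their reserved metadata
 * can be freed. Falls back to btrfs_start_delalloc_roots() when the
 * superblock's s_umount cannot be taken.
 */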
static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info,
                                         unsigned long nr_pages, int nr_items)
{
        struct super_block *sb = fs_info->sb;

        if (down_read_trylock(&sb->s_umount)) {
                writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
                up_read(&sb->s_umount);
        } else {
                /*
                 * We needn't worry about the filesystem going from r/w to r/o
                 * even though we don't acquire ->s_umount, because the
                 * filesystem guarantees that the delalloc inode list is empty
                 * once it is read-only (all dirty pages have been written to
                 * disk).
                 */
                btrfs_start_delalloc_roots(fs_info, nr_items);
                if (!current->journal_info)
                        btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1);
        }
}

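/*
 * Convert a byte count into the number of metadata items to reclaim, using
 * btrfs_calc_trans_metadata_size() for a single item as the per-item cost;
 * always at least one.
 */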
static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
                                        u64 to_reclaim)
{
        u64 bytes;
        u64 nr;

        bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
        nr = div64_u64(to_reclaim, bytes);
        if (!nr)
                nr = 1;
        return nr;
}

#define EXTENT_SIZE_PER_ITEM    SZ_256K

/*
 * shrink metadata reservation for delalloc
 */
static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
                            u64 orig, bool wait_ordered)
{
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        u64 delalloc_bytes;
        u64 dio_bytes;
        u64 async_pages;
        u64 items;
        long time_left;
        unsigned long nr_pages;
        int loops;

        /* Calculate the number of pages we need to flush for this reservation */
        items = calc_reclaim_items_nr(fs_info, to_reclaim);
        to_reclaim = items * EXTENT_SIZE_PER_ITEM;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
        dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        if (delalloc_bytes == 0 && dio_bytes == 0) {
                if (trans)
                        return;
                if (wait_ordered)
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                return;
        }

        /*
         * If we are doing more ordered than delalloc we need to just wait on
         * ordered extents, otherwise we'll waste time trying to flush delalloc
         * that likely won't give us the space back we need.
         */
        if (dio_bytes > delalloc_bytes)
                wait_ordered = true;

        loops = 0;
        while ((delalloc_bytes || dio_bytes) && loops < 3) {
                nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;

                /*
                 * Triggers inode writeback for up to nr_pages. This will invoke
                 * ->writepages callback and trigger delalloc filling
                 * (btrfs_run_delalloc_range()).
                 */
                btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items);

                /*
                 * We need to wait for the compressed pages to start before
                 * we continue.
                 */
                async_pages = atomic_read(&fs_info->async_delalloc_pages);
                if (!async_pages)
                        goto skip_async;

                /*
                 * Calculate how many compressed pages we want to be written
                 * before we continue. I.e. if there are more async pages than
                 * we require, wait_event will wait until nr_pages are written.
                 */
                if (async_pages <= nr_pages)
                        async_pages = 0;
                else
                        async_pages -= nr_pages;

                wait_event(fs_info->async_submit_wait,
                           atomic_read(&fs_info->async_delalloc_pages) <=
                           (int)async_pages);
skip_async:
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets) &&
                    list_empty(&space_info->priority_tickets)) {
                        spin_unlock(&space_info->lock);
                        break;
                }
                spin_unlock(&space_info->lock);

                loops++;
                if (wait_ordered && !trans) {
                        btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
                } else {
                        time_left = schedule_timeout_killable(1);
                        if (time_left)
                                break;
                }
                delalloc_bytes = percpu_counter_sum_positive(
                                                &fs_info->delalloc_bytes);
                dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes);
        }
}

/**
 * may_commit_transaction - possibly commit the transaction if it's ok to
 * @fs_info - the fs_info for our filesystem
 * @space_info - the space_info we are trying to reserve from
 *
 * This will check to make sure that committing the transaction will actually
 * get us somewhere and then commit the transaction if it does. Otherwise it
 * will return -ENOSPC.
 */
static int may_commit_transaction(struct btrfs_fs_info *fs_info,
                                  struct btrfs_space_info *space_info)
{
        struct reserve_ticket *ticket = NULL;
        struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
        struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
        struct btrfs_trans_handle *trans;
        u64 bytes_needed;
        u64 reclaim_bytes = 0;

        trans = (struct btrfs_trans_handle *)current->journal_info;
        if (trans)
                return -EAGAIN;

        spin_lock(&space_info->lock);
        if (!list_empty(&space_info->priority_tickets))
                ticket = list_first_entry(&space_info->priority_tickets,
                                          struct reserve_ticket, list);
        else if (!list_empty(&space_info->tickets))
                ticket = list_first_entry(&space_info->tickets,
                                          struct reserve_ticket, list);
        bytes_needed = (ticket) ? ticket->bytes : 0;
        spin_unlock(&space_info->lock);

        if (!bytes_needed)
                return 0;

        trans = btrfs_join_transaction(fs_info->extent_root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        /*
         * See if there is enough pinned space to make this reservation, or if
         * we have block groups that are going to be freed, allowing us to
         * possibly do a chunk allocation the next loop through.
         */
        if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) ||
            __percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0)
                goto commit;

        /*
         * See if there is some space in the delayed insertion reservation for
         * this reservation.
         */
        if (space_info != delayed_rsv->space_info)
                goto enospc;

        spin_lock(&delayed_rsv->lock);
        reclaim_bytes += delayed_rsv->reserved;
        spin_unlock(&delayed_rsv->lock);

        spin_lock(&delayed_refs_rsv->lock);
        reclaim_bytes += delayed_refs_rsv->reserved;
        spin_unlock(&delayed_refs_rsv->lock);
        if (reclaim_bytes >= bytes_needed)
                goto commit;
        bytes_needed -= reclaim_bytes;

        if (__percpu_counter_compare(&space_info->total_bytes_pinned,
                                     bytes_needed,
                                     BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0)
                goto enospc;

commit:
        return btrfs_commit_transaction(trans);
enospc:
        btrfs_end_transaction(trans);
        return -ENOSPC;
}

/*
 * Try to flush some data based on policy set by @state. This is only advisory
 * and may fail for various reasons. The caller is supposed to examine the
 * state of @space_info to detect the outcome.
 */
static void flush_space(struct btrfs_fs_info *fs_info,
                        struct btrfs_space_info *space_info, u64 num_bytes,
                        int state)
{
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_trans_handle *trans;
        int nr;
        int ret = 0;

        switch (state) {
        case FLUSH_DELAYED_ITEMS_NR:
        case FLUSH_DELAYED_ITEMS:
                if (state == FLUSH_DELAYED_ITEMS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
                else
                        nr = -1;

                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_run_delayed_items_nr(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case FLUSH_DELALLOC:
        case FLUSH_DELALLOC_WAIT:
                shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
                                state == FLUSH_DELALLOC_WAIT);
                break;
        case FLUSH_DELAYED_REFS_NR:
        case FLUSH_DELAYED_REFS:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                if (state == FLUSH_DELAYED_REFS_NR)
                        nr = calc_reclaim_items_nr(fs_info, num_bytes);
                else
                        nr = 0;
                btrfs_run_delayed_refs(trans, nr);
                btrfs_end_transaction(trans);
                break;
        case ALLOC_CHUNK:
        case ALLOC_CHUNK_FORCE:
                trans = btrfs_join_transaction(root);
                if (IS_ERR(trans)) {
                        ret = PTR_ERR(trans);
                        break;
                }
                ret = btrfs_chunk_alloc(trans,
                                btrfs_metadata_alloc_profile(fs_info),
                                (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
                                        CHUNK_ALLOC_FORCE);
                btrfs_end_transaction(trans);
                if (ret > 0 || ret == -ENOSPC)
                        ret = 0;
                break;
        case COMMIT_TRANS:
                /*
                 * If we have pending delayed iputs then we could free up a
                 * bunch of pinned space, so make sure we run the iputs before
                 * we do our pinned bytes check below.
                 */
                btrfs_run_delayed_iputs(fs_info);
                btrfs_wait_on_delayed_iputs(fs_info);

                ret = may_commit_transaction(fs_info, space_info);
                break;
        default:
                ret = -ENOSPC;
                break;
        }

        trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
                                ret);
        return;
}

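/*
 * How many bytes the flushers should try to reclaim: the sum of all
 * outstanding ticket bytes if there are any, otherwise a heuristic based on
 * how close bytes_may_use and bytes_reserved have pushed this space_info to
 * full.
 */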
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
                                 struct btrfs_space_info *space_info,
                                 bool system_chunk)
{
        struct reserve_ticket *ticket;
        u64 used;
        u64 expected;
        u64 to_reclaim = 0;

        list_for_each_entry(ticket, &space_info->tickets, list)
                to_reclaim += ticket->bytes;
        list_for_each_entry(ticket, &space_info->priority_tickets, list)
                to_reclaim += ticket->bytes;
        if (to_reclaim)
                return to_reclaim;

        to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
        if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
                                 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
                return 0;

        used = btrfs_space_info_used(space_info, true);

        if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
                                 BTRFS_RESERVE_FLUSH_ALL, system_chunk))
                expected = div_factor_fine(space_info->total_bytes, 95);
        else
                expected = div_factor_fine(space_info->total_bytes, 90);

        if (used > expected)
                to_reclaim = used - expected;
        else
                to_reclaim = 0;
        to_reclaim = min(to_reclaim, space_info->bytes_may_use +
                                     space_info->bytes_reserved);
        return to_reclaim;
}

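/*
 * Return nonzero if background reclaim should be kicked off for this
 * space_info: it is close to full of outstanding reservations but not simply
 * full of allocated data, and the filesystem is neither closing nor being
 * remounted.
 */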
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
                                        struct btrfs_space_info *space_info,
                                        u64 used, bool system_chunk)
{
        u64 thresh = div_factor_fine(space_info->total_bytes, 98);

        /* If we're just plain full then async reclaim just slows us down. */
        if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
                return 0;

        if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                              system_chunk))
                return 0;

        return (used >= thresh && !btrfs_fs_closing(fs_info) &&
                !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}

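/*
 * Fail every ticket on @head with -ENOSPC and wake its waiter. Returns true
 * if any ticket had already been partially filled, in which case the caller
 * restarts the flush state machine instead of giving up.
 */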
static bool wake_all_tickets(struct list_head *head)
{
        struct reserve_ticket *ticket;

        while (!list_empty(head)) {
                ticket = list_first_entry(head, struct reserve_ticket, list);
                list_del_init(&ticket->list);
                ticket->error = -ENOSPC;
                wake_up(&ticket->wait);
                if (ticket->bytes != ticket->orig_bytes)
                        return true;
        }
        return false;
}

/*
 * This is for normal flushers, we can wait all goddamned day if we want to. We
 * will loop and continuously try to flush as long as we are making progress.
 * We count progress as clearing off tickets each time we have to loop.
 */
static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
{
        struct btrfs_fs_info *fs_info;
        struct btrfs_space_info *space_info;
        u64 to_reclaim;
        int flush_state;
        int commit_cycles = 0;
        u64 last_tickets_id;

        fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
        space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                                      false);
        if (!to_reclaim) {
                space_info->flush = 0;
                spin_unlock(&space_info->lock);
                return;
        }
        last_tickets_id = space_info->tickets_id;
        spin_unlock(&space_info->lock);

        flush_state = FLUSH_DELAYED_ITEMS_NR;
        do {
                flush_space(fs_info, space_info, to_reclaim, flush_state);
                spin_lock(&space_info->lock);
                if (list_empty(&space_info->tickets)) {
                        space_info->flush = 0;
                        spin_unlock(&space_info->lock);
                        return;
                }
                to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
                                                              space_info,
                                                              false);
                if (last_tickets_id == space_info->tickets_id) {
                        flush_state++;
                } else {
                        last_tickets_id = space_info->tickets_id;
                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                        if (commit_cycles)
                                commit_cycles--;
                }

                /*
                 * We don't want to force a chunk allocation until we've tried
                 * pretty hard to reclaim space. Think of the case where we
                 * freed up a bunch of space and so have a lot of pinned space
                 * to reclaim. We would rather use that than possibly create an
                 * underutilized metadata chunk. So if this is our first run
                 * through the flushing state machine skip ALLOC_CHUNK_FORCE and
                 * commit the transaction. If nothing has changed the next go
                 * around then we can force a chunk allocation.
                 */
                if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
                        flush_state++;

                if (flush_state > COMMIT_TRANS) {
                        commit_cycles++;
                        if (commit_cycles > 2) {
                                if (wake_all_tickets(&space_info->tickets)) {
                                        flush_state = FLUSH_DELAYED_ITEMS_NR;
                                        commit_cycles--;
                                } else {
                                        space_info->flush = 0;
                                }
                        } else {
                                flush_state = FLUSH_DELAYED_ITEMS_NR;
                        }
                }
                spin_unlock(&space_info->lock);
        } while (flush_state <= COMMIT_TRANS);
}

void btrfs_init_async_reclaim_work(struct work_struct *work)
{
        INIT_WORK(work, btrfs_async_reclaim_metadata_space);
}

static const enum btrfs_flush_state priority_flush_states[] = {
        FLUSH_DELAYED_ITEMS_NR,
        FLUSH_DELAYED_ITEMS,
        ALLOC_CHUNK,
};

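/*
 * Synchronous flushing for priority tickets: run each state in
 * priority_flush_states once, stopping as soon as @ticket has been satisfied.
 */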
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
                                            struct btrfs_space_info *space_info,
                                            struct reserve_ticket *ticket)
{
        u64 to_reclaim;
        int flush_state;

        spin_lock(&space_info->lock);
        to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
                                                      false);
        if (!to_reclaim) {
                spin_unlock(&space_info->lock);
                return;
        }
        spin_unlock(&space_info->lock);

        flush_state = 0;
        do {
                flush_space(fs_info, space_info, to_reclaim,
                            priority_flush_states[flush_state]);
                flush_state++;
                spin_lock(&space_info->lock);
                if (ticket->bytes == 0) {
                        spin_unlock(&space_info->lock);
                        return;
                }
                spin_unlock(&space_info->lock);
        } while (flush_state < ARRAY_SIZE(priority_flush_states));
}

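/*
 * Sleep until @ticket is satisfied by the async flusher, fails, or we are
 * killed. Any partial reservation left on the ticket is returned to the
 * space_info via btrfs_space_info_add_old_bytes().
 */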
static int wait_reserve_ticket(struct btrfs_fs_info *fs_info,
                               struct btrfs_space_info *space_info,
                               struct reserve_ticket *ticket)
{
        DEFINE_WAIT(wait);
        u64 reclaim_bytes = 0;
        int ret = 0;

        spin_lock(&space_info->lock);
        while (ticket->bytes > 0 && ticket->error == 0) {
                ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
                if (ret) {
                        ret = -EINTR;
                        break;
                }
                spin_unlock(&space_info->lock);

                schedule();

                finish_wait(&ticket->wait, &wait);
                spin_lock(&space_info->lock);
        }
        if (!ret)
                ret = ticket->error;
        if (!list_empty(&ticket->list))
                list_del_init(&ticket->list);
        if (ticket->bytes && ticket->bytes < ticket->orig_bytes)
                reclaim_bytes = ticket->orig_bytes - ticket->bytes;
        spin_unlock(&space_info->lock);

        if (reclaim_bytes)
                btrfs_space_info_add_old_bytes(fs_info, space_info,
                                               reclaim_bytes);
        return ret;
}

/**
 * __reserve_metadata_bytes - try to reserve bytes from a space_info
 * @fs_info - the filesystem we're allocating for
 * @space_info - the space_info we want to allocate from
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 * @system_chunk - whether this reservation is for the chunk root
 *
 * This will reserve orig_bytes number of bytes from the given space_info. If
 * there is not enough space it will make an attempt to flush out space to
 * make room. It will do this by flushing delalloc if possible or committing
 * the transaction. If flush is BTRFS_RESERVE_NO_FLUSH then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
                                    struct btrfs_space_info *space_info,
                                    u64 orig_bytes,
                                    enum btrfs_reserve_flush_enum flush,
                                    bool system_chunk)
{
        struct reserve_ticket ticket;
        u64 used;
        u64 reclaim_bytes = 0;
        int ret = 0;

        ASSERT(orig_bytes);
        ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

        spin_lock(&space_info->lock);
        ret = -ENOSPC;
        used = btrfs_space_info_used(space_info, true);

        /*
         * If we have enough space then hooray, make our reservation and carry
         * on. If not see if we can overcommit, and if we can, hooray carry on.
         * If not, things get more complicated.
         */
        if (used + orig_bytes <= space_info->total_bytes) {
                btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                      orig_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, orig_bytes, 1);
                ret = 0;
        } else if (btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush,
                                        system_chunk)) {
                btrfs_space_info_update_bytes_may_use(fs_info, space_info,
                                                      orig_bytes);
                trace_btrfs_space_reservation(fs_info, "space_info",
                                              space_info->flags, orig_bytes, 1);
                ret = 0;
        }

        /*
         * If we couldn't make a reservation then set up our reservation
         * ticket and kick the async worker if it's not already running.
         *
         * If we are a priority flusher then we just need to add our ticket to
         * the list and we will do our own flushing further down.
         */
        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                ticket.orig_bytes = orig_bytes;
                ticket.bytes = orig_bytes;
                ticket.error = 0;
                init_waitqueue_head(&ticket.wait);
                if (flush == BTRFS_RESERVE_FLUSH_ALL) {
                        list_add_tail(&ticket.list, &space_info->tickets);
                        if (!space_info->flush) {
                                space_info->flush = 1;
                                trace_btrfs_trigger_flush(fs_info,
                                                          space_info->flags,
                                                          orig_bytes, flush,
                                                          "enospc");
                                queue_work(system_unbound_wq,
                                           &fs_info->async_reclaim_work);
                        }
                } else {
                        list_add_tail(&ticket.list,
                                      &space_info->priority_tickets);
                }
        } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
                used += orig_bytes;
                /*
                 * We will do the space reservation dance during log replay,
                 * which means we won't have fs_info->fs_root set, so don't do
                 * the async reclaim as we will panic.
                 */
                if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
                    need_do_async_reclaim(fs_info, space_info,
                                          used, system_chunk) &&
                    !work_busy(&fs_info->async_reclaim_work)) {
                        trace_btrfs_trigger_flush(fs_info, space_info->flags,
                                                  orig_bytes, flush, "preempt");
                        queue_work(system_unbound_wq,
                                   &fs_info->async_reclaim_work);
                }
        }
        spin_unlock(&space_info->lock);
        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                return ret;

        if (flush == BTRFS_RESERVE_FLUSH_ALL)
                return wait_reserve_ticket(fs_info, space_info, &ticket);

        ret = 0;
        priority_reclaim_metadata_space(fs_info, space_info, &ticket);
        spin_lock(&space_info->lock);
        if (ticket.bytes) {
                if (ticket.bytes < orig_bytes)
                        reclaim_bytes = orig_bytes - ticket.bytes;
                list_del_init(&ticket.list);
                ret = -ENOSPC;
        }
        spin_unlock(&space_info->lock);

        if (reclaim_bytes)
                btrfs_space_info_add_old_bytes(fs_info, space_info,
                                               reclaim_bytes);
        ASSERT(list_empty(&ticket.list));
        return ret;
}

/**
 * btrfs_reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is BTRFS_RESERVE_NO_FLUSH
 * then no attempts to regain reservations will be made and this will fail if
 * there is not enough space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
                                 struct btrfs_block_rsv *block_rsv,
                                 u64 orig_bytes,
                                 enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
        int ret;
        bool system_chunk = (root == fs_info->chunk_root);

        ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
                                       orig_bytes, flush, system_chunk);
        if (ret == -ENOSPC &&
            unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
                if (block_rsv != global_rsv &&
                    !btrfs_block_rsv_use_bytes(global_rsv, orig_bytes))
                        ret = 0;
        }
        if (ret == -ENOSPC) {
                trace_btrfs_space_reservation(fs_info, "space_info:enospc",
                                              block_rsv->space_info->flags,
                                              orig_bytes, 1);

                if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
                        btrfs_dump_space_info(fs_info, block_rsv->space_info,
                                              orig_bytes, 0);
        }
        return ret;
}