Blame - fs/btrfs/block-group.c - SHIFTPHONES/mainline/linux

blob: 444e9c89ff3e9a798be1c863df88154e321dc380 [file] [log] [blame]

Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
Johannes Thumshirn	2ca0ec7	2021-10-14 18:39:02 +0900	[diff] [blame]	3	#include <linux/list_sort.h>
David Sterba	784352f	2019-08-21 18:54:28 +0200	[diff] [blame]	4	#include "misc.h"
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	5	#include "ctree.h"
				6	#include "block-group.h"
Josef Bacik	3eeb322	2019-06-20 15:37:47 -0400	[diff] [blame]	7	#include "space-info.h"
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	8	#include "disk-io.h"
				9	#include "free-space-cache.h"
				10	#include "free-space-tree.h"
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	11	#include "volumes.h"
				12	#include "transaction.h"
				13	#include "ref-verify.h"
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	14	#include "sysfs.h"
				15	#include "tree-log.h"
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	16	#include "delalloc-space.h"
Dennis Zhou	b0643e5	2019-12-13 16:22:14 -0800	[diff] [blame]	17	#include "discard.h"
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	18	#include "raid56.h"
Naohiro Aota	08e11a3	2021-02-04 19:21:50 +0900	[diff] [blame]	19	#include "zoned.h"
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	20
Josef Bacik	878d7b6	2019-06-20 15:38:05 -0400	[diff] [blame]	21	/*
				22	* Return target flags in extended format or 0 if restripe for this chunk_type
				23	* is not in progress
				24	*
				25	* Should be called with balance_lock held
				26	*/
Josef Bacik	e11c040	2019-06-20 15:38:07 -0400	[diff] [blame]	27	static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
Josef Bacik	878d7b6	2019-06-20 15:38:05 -0400	[diff] [blame]	28	{
				29	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
				30	u64 target = 0;
				31
				32	if (!bctl)
				33	return 0;
				34
				35	if (flags & BTRFS_BLOCK_GROUP_DATA &&
				36	bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				37	target = BTRFS_BLOCK_GROUP_DATA \| bctl->data.target;
				38	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
				39	bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				40	target = BTRFS_BLOCK_GROUP_SYSTEM \| bctl->sys.target;
				41	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
				42	bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
				43	target = BTRFS_BLOCK_GROUP_METADATA \| bctl->meta.target;
				44	}
				45
				46	return target;
				47	}
				48
				49	/*
				50	* @flags: available profiles in extended format (see ctree.h)
				51	*
				52	* Return reduced profile in chunk format. If profile changing is in progress
				53	* (either running or paused) picks the target profile (if it's already
				54	* available), otherwise falls back to plain reducing.
				55	*/
				56	static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
				57	{
				58	u64 num_devices = fs_info->fs_devices->rw_devices;
				59	u64 target;
				60	u64 raid_type;
				61	u64 allowed = 0;
				62
				63	/*
				64	* See if restripe for this chunk_type is in progress, if so try to
				65	* reduce to the target profile
				66	*/
				67	spin_lock(&fs_info->balance_lock);
Josef Bacik	e11c040	2019-06-20 15:38:07 -0400	[diff] [blame]	68	target = get_restripe_target(fs_info, flags);
Josef Bacik	878d7b6	2019-06-20 15:38:05 -0400	[diff] [blame]	69	if (target) {
Josef Bacik	162e0a1	2020-07-21 10:48:46 -0400	[diff] [blame]	70	spin_unlock(&fs_info->balance_lock);
				71	return extended_to_chunk(target);
Josef Bacik	878d7b6	2019-06-20 15:38:05 -0400	[diff] [blame]	72	}
				73	spin_unlock(&fs_info->balance_lock);
				74
				75	/* First, mask out the RAID levels which aren't possible */
				76	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
				77	if (num_devices >= btrfs_raid_array[raid_type].devs_min)
				78	allowed \|= btrfs_raid_array[raid_type].bg_flag;
				79	}
				80	allowed &= flags;
				81
				82	if (allowed & BTRFS_BLOCK_GROUP_RAID6)
				83	allowed = BTRFS_BLOCK_GROUP_RAID6;
				84	else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
				85	allowed = BTRFS_BLOCK_GROUP_RAID5;
				86	else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
				87	allowed = BTRFS_BLOCK_GROUP_RAID10;
				88	else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
				89	allowed = BTRFS_BLOCK_GROUP_RAID1;
				90	else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
				91	allowed = BTRFS_BLOCK_GROUP_RAID0;
				92
				93	flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
				94
				95	return extended_to_chunk(flags \| allowed);
				96	}
				97
Johannes Thumshirn	ef0a82d	2020-01-02 17:14:57 +0100	[diff] [blame]	98	u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
Josef Bacik	878d7b6	2019-06-20 15:38:05 -0400	[diff] [blame]	99	{
				100	unsigned seq;
				101	u64 flags;
				102
				103	do {
				104	flags = orig_flags;
				105	seq = read_seqbegin(&fs_info->profiles_lock);
				106
				107	if (flags & BTRFS_BLOCK_GROUP_DATA)
				108	flags \|= fs_info->avail_data_alloc_bits;
				109	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				110	flags \|= fs_info->avail_system_alloc_bits;
				111	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
				112	flags \|= fs_info->avail_metadata_alloc_bits;
				113	} while (read_seqretry(&fs_info->profiles_lock, seq));
				114
				115	return btrfs_reduce_alloc_profile(fs_info, flags);
				116	}
				117
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	118	void btrfs_get_block_group(struct btrfs_block_group *cache)
Josef Bacik	3cad128	2019-06-20 15:37:46 -0400	[diff] [blame]	119	{
Josef Bacik	48aaeeb	2020-07-06 09:14:11 -0400	[diff] [blame]	120	refcount_inc(&cache->refs);
Josef Bacik	3cad128	2019-06-20 15:37:46 -0400	[diff] [blame]	121	}
				122
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	123	void btrfs_put_block_group(struct btrfs_block_group *cache)
Josef Bacik	3cad128	2019-06-20 15:37:46 -0400	[diff] [blame]	124	{
Josef Bacik	48aaeeb	2020-07-06 09:14:11 -0400	[diff] [blame]	125	if (refcount_dec_and_test(&cache->refs)) {
Josef Bacik	3cad128	2019-06-20 15:37:46 -0400	[diff] [blame]	126	WARN_ON(cache->pinned > 0);
				127	WARN_ON(cache->reserved > 0);
				128
				129	/*
Dennis Zhou	b0643e5	2019-12-13 16:22:14 -0800	[diff] [blame]	130	* A block_group shouldn't be on the discard_list anymore.
				131	* Remove the block_group from the discard_list to prevent us
				132	* from causing a panic due to NULL pointer dereference.
				133	*/
				134	if (WARN_ON(!list_empty(&cache->discard_list)))
				135	btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
				136	cache);
				137
				138	/*
Josef Bacik	3cad128	2019-06-20 15:37:46 -0400	[diff] [blame]	139	* If not empty, someone is still holding mutex of
				140	* full_stripe_lock, which can only be released by caller.
				141	* And it will definitely cause use-after-free when caller
				142	* tries to release full stripe lock.
				143	*
				144	* No better way to resolve, but only to warn.
				145	*/
				146	WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
				147	kfree(cache->free_space_ctl);
Naohiro Aota	dafc340d	2021-08-19 21:19:16 +0900	[diff] [blame]	148	kfree(cache->physical_map);
Josef Bacik	3cad128	2019-06-20 15:37:46 -0400	[diff] [blame]	149	kfree(cache);
				150	}
				151	}
				152
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	153	/*
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	154	* This adds the block group to the fs_info rb tree for the block group cache
				155	*/
				156	static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	157	struct btrfs_block_group *block_group)
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	158	{
				159	struct rb_node **p;
				160	struct rb_node *parent = NULL;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	161	struct btrfs_block_group *cache;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	162
Qu Wenruo	9afc664	2020-05-05 07:58:20 +0800	[diff] [blame]	163	ASSERT(block_group->length != 0);
				164
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	165	spin_lock(&info->block_group_cache_lock);
				166	p = &info->block_group_cache_tree.rb_node;
				167
				168	while (*p) {
				169	parent = *p;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	170	cache = rb_entry(parent, struct btrfs_block_group, cache_node);
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	171	if (block_group->start < cache->start) {
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	172	p = &(*p)->rb_left;
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	173	} else if (block_group->start > cache->start) {
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	174	p = &(*p)->rb_right;
				175	} else {
				176	spin_unlock(&info->block_group_cache_lock);
				177	return -EEXIST;
				178	}
				179	}
				180
				181	rb_link_node(&block_group->cache_node, parent, p);
				182	rb_insert_color(&block_group->cache_node,
				183	&info->block_group_cache_tree);
				184
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	185	if (info->first_logical_byte > block_group->start)
				186	info->first_logical_byte = block_group->start;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	187
				188	spin_unlock(&info->block_group_cache_lock);
				189
				190	return 0;
				191	}
				192
				193	/*
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	194	* This will return the block group at or after bytenr if contains is 0, else
				195	* it will return the block group that contains the bytenr
				196	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	197	static struct btrfs_block_group *block_group_cache_tree_search(
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	198	struct btrfs_fs_info *info, u64 bytenr, int contains)
				199	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	200	struct btrfs_block_group cache, ret = NULL;
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	201	struct rb_node *n;
				202	u64 end, start;
				203
				204	spin_lock(&info->block_group_cache_lock);
				205	n = info->block_group_cache_tree.rb_node;
				206
				207	while (n) {
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	208	cache = rb_entry(n, struct btrfs_block_group, cache_node);
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	209	end = cache->start + cache->length - 1;
				210	start = cache->start;
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	211
				212	if (bytenr < start) {
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	213	if (!contains && (!ret \|\| start < ret->start))
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	214	ret = cache;
				215	n = n->rb_left;
				216	} else if (bytenr > start) {
				217	if (contains && bytenr <= end) {
				218	ret = cache;
				219	break;
				220	}
				221	n = n->rb_right;
				222	} else {
				223	ret = cache;
				224	break;
				225	}
				226	}
				227	if (ret) {
				228	btrfs_get_block_group(ret);
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	229	if (bytenr == 0 && info->first_logical_byte > ret->start)
				230	info->first_logical_byte = ret->start;
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	231	}
				232	spin_unlock(&info->block_group_cache_lock);
				233
				234	return ret;
				235	}
				236
				237	/*
				238	* Return the block group that starts at or after bytenr
				239	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	240	struct btrfs_block_group *btrfs_lookup_first_block_group(
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	241	struct btrfs_fs_info *info, u64 bytenr)
				242	{
				243	return block_group_cache_tree_search(info, bytenr, 0);
				244	}
				245
				246	/*
				247	* Return the block group that contains the given bytenr
				248	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	249	struct btrfs_block_group *btrfs_lookup_block_group(
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	250	struct btrfs_fs_info *info, u64 bytenr)
				251	{
				252	return block_group_cache_tree_search(info, bytenr, 1);
				253	}
				254
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	255	struct btrfs_block_group *btrfs_next_block_group(
				256	struct btrfs_block_group *cache)
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	257	{
				258	struct btrfs_fs_info *fs_info = cache->fs_info;
				259	struct rb_node *node;
				260
				261	spin_lock(&fs_info->block_group_cache_lock);
				262
				263	/* If our block group was removed, we need a full search. */
				264	if (RB_EMPTY_NODE(&cache->cache_node)) {
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	265	const u64 next_bytenr = cache->start + cache->length;
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	266
				267	spin_unlock(&fs_info->block_group_cache_lock);
				268	btrfs_put_block_group(cache);
				269	cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
				270	}
				271	node = rb_next(&cache->cache_node);
				272	btrfs_put_block_group(cache);
				273	if (node) {
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	274	cache = rb_entry(node, struct btrfs_block_group, cache_node);
Josef Bacik	2e405ad	2019-06-20 15:37:45 -0400	[diff] [blame]	275	btrfs_get_block_group(cache);
				276	} else
				277	cache = NULL;
				278	spin_unlock(&fs_info->block_group_cache_lock);
				279	return cache;
				280	}
Josef Bacik	3eeb322	2019-06-20 15:37:47 -0400	[diff] [blame]	281
				282	bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
				283	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	284	struct btrfs_block_group *bg;
Josef Bacik	3eeb322	2019-06-20 15:37:47 -0400	[diff] [blame]	285	bool ret = true;
				286
				287	bg = btrfs_lookup_block_group(fs_info, bytenr);
				288	if (!bg)
				289	return false;
				290
				291	spin_lock(&bg->lock);
				292	if (bg->ro)
				293	ret = false;
				294	else
				295	atomic_inc(&bg->nocow_writers);
				296	spin_unlock(&bg->lock);
				297
				298	/* No put on block group, done by btrfs_dec_nocow_writers */
				299	if (!ret)
				300	btrfs_put_block_group(bg);
				301
				302	return ret;
				303	}
				304
				305	void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
				306	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	307	struct btrfs_block_group *bg;
Josef Bacik	3eeb322	2019-06-20 15:37:47 -0400	[diff] [blame]	308
				309	bg = btrfs_lookup_block_group(fs_info, bytenr);
				310	ASSERT(bg);
				311	if (atomic_dec_and_test(&bg->nocow_writers))
				312	wake_up_var(&bg->nocow_writers);
				313	/*
				314	* Once for our lookup and once for the lookup done by a previous call
				315	* to btrfs_inc_nocow_writers()
				316	*/
				317	btrfs_put_block_group(bg);
				318	btrfs_put_block_group(bg);
				319	}
				320
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	321	void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
Josef Bacik	3eeb322	2019-06-20 15:37:47 -0400	[diff] [blame]	322	{
				323	wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
				324	}
				325
				326	void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
				327	const u64 start)
				328	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	329	struct btrfs_block_group *bg;
Josef Bacik	3eeb322	2019-06-20 15:37:47 -0400	[diff] [blame]	330
				331	bg = btrfs_lookup_block_group(fs_info, start);
				332	ASSERT(bg);
				333	if (atomic_dec_and_test(&bg->reservations))
				334	wake_up_var(&bg->reservations);
				335	btrfs_put_block_group(bg);
				336	}
				337
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	338	void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
Josef Bacik	3eeb322	2019-06-20 15:37:47 -0400	[diff] [blame]	339	{
				340	struct btrfs_space_info *space_info = bg->space_info;
				341
				342	ASSERT(bg->ro);
				343
				344	if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
				345	return;
				346
				347	/*
				348	* Our block group is read only but before we set it to read only,
				349	* some task might have had allocated an extent from it already, but it
				350	* has not yet created a respective ordered extent (and added it to a
				351	* root's list of ordered extents).
				352	* Therefore wait for any task currently allocating extents, since the
				353	* block group's reservations counter is incremented while a read lock
				354	* on the groups' semaphore is held and decremented after releasing
				355	* the read access on that semaphore and creating the ordered extent.
				356	*/
				357	down_write(&space_info->groups_sem);
				358	up_write(&space_info->groups_sem);
				359
				360	wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
				361	}
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	362
				363	struct btrfs_caching_control *btrfs_get_caching_control(
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	364	struct btrfs_block_group *cache)
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	365	{
				366	struct btrfs_caching_control *ctl;
				367
				368	spin_lock(&cache->lock);
				369	if (!cache->caching_ctl) {
				370	spin_unlock(&cache->lock);
				371	return NULL;
				372	}
				373
				374	ctl = cache->caching_ctl;
				375	refcount_inc(&ctl->count);
				376	spin_unlock(&cache->lock);
				377	return ctl;
				378	}
				379
				380	void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
				381	{
				382	if (refcount_dec_and_test(&ctl->count))
				383	kfree(ctl);
				384	}
				385
				386	/*
				387	* When we wait for progress in the block group caching, its because our
				388	* allocation attempt failed at least once. So, we must sleep and let some
				389	* progress happen before we try again.
				390	*
				391	* This function will sleep at least once waiting for new free space to show
				392	* up, and then it will check the block group free space numbers for our min
				393	* num_bytes. Another option is to have it go ahead and look in the rbtree for
				394	* a free extent of a given size, but this is a good start.
				395	*
				396	* Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
				397	* any of the information in this block group.
				398	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	399	void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	400	u64 num_bytes)
				401	{
				402	struct btrfs_caching_control *caching_ctl;
				403
				404	caching_ctl = btrfs_get_caching_control(cache);
				405	if (!caching_ctl)
				406	return;
				407
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	408	wait_event(caching_ctl->wait, btrfs_block_group_done(cache) \|\|
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	409	(cache->free_space_ctl->free_space >= num_bytes));
				410
				411	btrfs_put_caching_control(caching_ctl);
				412	}
				413
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	414	int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	415	{
				416	struct btrfs_caching_control *caching_ctl;
				417	int ret = 0;
				418
				419	caching_ctl = btrfs_get_caching_control(cache);
				420	if (!caching_ctl)
				421	return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
				422
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	423	wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	424	if (cache->cached == BTRFS_CACHE_ERROR)
				425	ret = -EIO;
				426	btrfs_put_caching_control(caching_ctl);
				427	return ret;
				428	}
				429
Josef Bacik	e747853	2020-10-23 09:58:10 -0400	[diff] [blame]	430	static bool space_cache_v1_done(struct btrfs_block_group *cache)
				431	{
				432	bool ret;
				433
				434	spin_lock(&cache->lock);
				435	ret = cache->cached != BTRFS_CACHE_FAST;
				436	spin_unlock(&cache->lock);
				437
				438	return ret;
				439	}
				440
				441	void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
				442	struct btrfs_caching_control *caching_ctl)
				443	{
				444	wait_event(caching_ctl->wait, space_cache_v1_done(cache));
				445	}
				446
#ifdef CONFIG_BTRFS_DEBUG
/*
 * Debug helper that fragments the free space of @block_group on purpose.
 *
 * Starting at the beginning of the block group, one chunk is removed from the
 * free space out of every two.  The chunk size is nodesize for metadata block
 * groups and sectorsize otherwise.
 */
static void fragment_free_space(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	u64 offset = block_group->start;
	u64 remaining = block_group->length;
	u64 chunk = (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ?
			fs_info->nodesize : fs_info->sectorsize;
	u64 stride = chunk << 1;

	while (remaining > chunk) {
		btrfs_remove_free_space(block_group, offset, chunk);
		offset += stride;

		/* Clamp at zero rather than letting the counter wrap. */
		remaining = (remaining < stride) ? 0 : remaining - stride;
	}
}
#endif
				467
				468	/*
				469	* This is only called by btrfs_cache_block_group, since we could have freed
				470	* extents we need to check the pinned_extents for any extents that can't be
				471	* used yet since their free space will be released as soon as the transaction
				472	* commits.
				473	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	474	u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	475	{
				476	struct btrfs_fs_info *info = block_group->fs_info;
				477	u64 extent_start, extent_end, size, total_added = 0;
				478	int ret;
				479
				480	while (start < end) {
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	481	ret = find_first_extent_bit(&info->excluded_extents, start,
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	482	&extent_start, &extent_end,
				483	EXTENT_DIRTY \| EXTENT_UPTODATE,
				484	NULL);
				485	if (ret)
				486	break;
				487
				488	if (extent_start <= start) {
				489	start = extent_end + 1;
				490	} else if (extent_start > start && extent_start < end) {
				491	size = extent_start - start;
				492	total_added += size;
Dennis Zhou	b0643e5	2019-12-13 16:22:14 -0800	[diff] [blame]	493	ret = btrfs_add_free_space_async_trimmed(block_group,
				494	start, size);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	495	BUG_ON(ret); /* -ENOMEM or logic error */
				496	start = extent_end + 1;
				497	} else {
				498	break;
				499	}
				500	}
				501
				502	if (start < end) {
				503	size = end - start;
				504	total_added += size;
Dennis Zhou	b0643e5	2019-12-13 16:22:14 -0800	[diff] [blame]	505	ret = btrfs_add_free_space_async_trimmed(block_group, start,
				506	size);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	507	BUG_ON(ret); /* -ENOMEM or logic error */
				508	}
				509
				510	return total_added;
				511	}
				512
				513	static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
				514	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	515	struct btrfs_block_group *block_group = caching_ctl->block_group;
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	516	struct btrfs_fs_info *fs_info = block_group->fs_info;
				517	struct btrfs_root *extent_root = fs_info->extent_root;
				518	struct btrfs_path *path;
				519	struct extent_buffer *leaf;
				520	struct btrfs_key key;
				521	u64 total_found = 0;
				522	u64 last = 0;
				523	u32 nritems;
				524	int ret;
				525	bool wakeup = true;
				526
				527	path = btrfs_alloc_path();
				528	if (!path)
				529	return -ENOMEM;
				530
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	531	last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	532
				533	#ifdef CONFIG_BTRFS_DEBUG
				534	/*
				535	* If we're fragmenting we don't want to make anybody think we can
				536	* allocate from this block group until we've had a chance to fragment
				537	* the free space.
				538	*/
				539	if (btrfs_should_fragment_free_space(block_group))
				540	wakeup = false;
				541	#endif
				542	/*
				543	* We don't want to deadlock with somebody trying to allocate a new
				544	* extent for the extent root while also trying to search the extent
				545	* root to add free space. So we skip locking and search the commit
				546	* root, since its read-only
				547	*/
				548	path->skip_locking = 1;
				549	path->search_commit_root = 1;
				550	path->reada = READA_FORWARD;
				551
				552	key.objectid = last;
				553	key.offset = 0;
				554	key.type = BTRFS_EXTENT_ITEM_KEY;
				555
				556	next:
				557	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
				558	if (ret < 0)
				559	goto out;
				560
				561	leaf = path->nodes[0];
				562	nritems = btrfs_header_nritems(leaf);
				563
				564	while (1) {
				565	if (btrfs_fs_closing(fs_info) > 1) {
				566	last = (u64)-1;
				567	break;
				568	}
				569
				570	if (path->slots[0] < nritems) {
				571	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
				572	} else {
				573	ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
				574	if (ret)
				575	break;
				576
				577	if (need_resched() \|\|
				578	rwsem_is_contended(&fs_info->commit_root_sem)) {
				579	if (wakeup)
				580	caching_ctl->progress = last;
				581	btrfs_release_path(path);
				582	up_read(&fs_info->commit_root_sem);
				583	mutex_unlock(&caching_ctl->mutex);
				584	cond_resched();
				585	mutex_lock(&caching_ctl->mutex);
				586	down_read(&fs_info->commit_root_sem);
				587	goto next;
				588	}
				589
				590	ret = btrfs_next_leaf(extent_root, path);
				591	if (ret < 0)
				592	goto out;
				593	if (ret)
				594	break;
				595	leaf = path->nodes[0];
				596	nritems = btrfs_header_nritems(leaf);
				597	continue;
				598	}
				599
				600	if (key.objectid < last) {
				601	key.objectid = last;
				602	key.offset = 0;
				603	key.type = BTRFS_EXTENT_ITEM_KEY;
				604
				605	if (wakeup)
				606	caching_ctl->progress = last;
				607	btrfs_release_path(path);
				608	goto next;
				609	}
				610
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	611	if (key.objectid < block_group->start) {
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	612	path->slots[0]++;
				613	continue;
				614	}
				615
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	616	if (key.objectid >= block_group->start + block_group->length)
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	617	break;
				618
				619	if (key.type == BTRFS_EXTENT_ITEM_KEY \|\|
				620	key.type == BTRFS_METADATA_ITEM_KEY) {
				621	total_found += add_new_free_space(block_group, last,
				622	key.objectid);
				623	if (key.type == BTRFS_METADATA_ITEM_KEY)
				624	last = key.objectid +
				625	fs_info->nodesize;
				626	else
				627	last = key.objectid + key.offset;
				628
				629	if (total_found > CACHING_CTL_WAKE_UP) {
				630	total_found = 0;
				631	if (wakeup)
				632	wake_up(&caching_ctl->wait);
				633	}
				634	}
				635	path->slots[0]++;
				636	}
				637	ret = 0;
				638
				639	total_found += add_new_free_space(block_group, last,
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	640	block_group->start + block_group->length);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	641	caching_ctl->progress = (u64)-1;
				642
				643	out:
				644	btrfs_free_path(path);
				645	return ret;
				646	}
				647
				648	static noinline void caching_thread(struct btrfs_work *work)
				649	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	650	struct btrfs_block_group *block_group;
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	651	struct btrfs_fs_info *fs_info;
				652	struct btrfs_caching_control *caching_ctl;
				653	int ret;
				654
				655	caching_ctl = container_of(work, struct btrfs_caching_control, work);
				656	block_group = caching_ctl->block_group;
				657	fs_info = block_group->fs_info;
				658
				659	mutex_lock(&caching_ctl->mutex);
				660	down_read(&fs_info->commit_root_sem);
				661
Josef Bacik	e747853	2020-10-23 09:58:10 -0400	[diff] [blame]	662	if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
				663	ret = load_free_space_cache(block_group);
				664	if (ret == 1) {
				665	ret = 0;
				666	goto done;
				667	}
				668
				669	/*
				670	* We failed to load the space cache, set ourselves to
				671	* CACHE_STARTED and carry on.
				672	*/
				673	spin_lock(&block_group->lock);
				674	block_group->cached = BTRFS_CACHE_STARTED;
				675	spin_unlock(&block_group->lock);
				676	wake_up(&caching_ctl->wait);
				677	}
				678
Josef Bacik	2f96e40	2021-01-15 16:26:17 -0500	[diff] [blame]	679	/*
				680	* If we are in the transaction that populated the free space tree we
				681	* can't actually cache from the free space tree as our commit root and
				682	* real root are the same, so we could change the contents of the blocks
				683	* while caching. Instead do the slow caching in this case, and after
				684	* the transaction has committed we will be safe.
				685	*/
				686	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
				687	!(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	688	ret = load_free_space_tree(caching_ctl);
				689	else
				690	ret = load_extent_tree_free(caching_ctl);
Josef Bacik	e747853	2020-10-23 09:58:10 -0400	[diff] [blame]	691	done:
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	692	spin_lock(&block_group->lock);
				693	block_group->caching_ctl = NULL;
				694	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
				695	spin_unlock(&block_group->lock);
				696
				697	#ifdef CONFIG_BTRFS_DEBUG
				698	if (btrfs_should_fragment_free_space(block_group)) {
				699	u64 bytes_used;
				700
				701	spin_lock(&block_group->space_info->lock);
				702	spin_lock(&block_group->lock);
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	703	bytes_used = block_group->length - block_group->used;
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	704	block_group->space_info->bytes_used += bytes_used >> 1;
				705	spin_unlock(&block_group->lock);
				706	spin_unlock(&block_group->space_info->lock);
Josef Bacik	e11c040	2019-06-20 15:38:07 -0400	[diff] [blame]	707	fragment_free_space(block_group);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	708	}
				709	#endif
				710
				711	caching_ctl->progress = (u64)-1;
				712
				713	up_read(&fs_info->commit_root_sem);
				714	btrfs_free_excluded_extents(block_group);
				715	mutex_unlock(&caching_ctl->mutex);
				716
				717	wake_up(&caching_ctl->wait);
				718
				719	btrfs_put_caching_control(caching_ctl);
				720	btrfs_put_block_group(block_group);
				721	}
				722
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	723	int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	724	{
				725	DEFINE_WAIT(wait);
				726	struct btrfs_fs_info *fs_info = cache->fs_info;
Josef Bacik	e747853	2020-10-23 09:58:10 -0400	[diff] [blame]	727	struct btrfs_caching_control *caching_ctl = NULL;
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	728	int ret = 0;
				729
Naohiro Aota	2eda570	2021-02-04 19:21:53 +0900	[diff] [blame]	730	/* Allocator for zoned filesystems does not use the cache at all */
				731	if (btrfs_is_zoned(fs_info))
				732	return 0;
				733
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	734	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
				735	if (!caching_ctl)
				736	return -ENOMEM;
				737
				738	INIT_LIST_HEAD(&caching_ctl->list);
				739	mutex_init(&caching_ctl->mutex);
				740	init_waitqueue_head(&caching_ctl->wait);
				741	caching_ctl->block_group = cache;
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	742	caching_ctl->progress = cache->start;
Josef Bacik	e747853	2020-10-23 09:58:10 -0400	[diff] [blame]	743	refcount_set(&caching_ctl->count, 2);
Omar Sandoval	a0cac0e	2019-09-16 11:30:57 -0700	[diff] [blame]	744	btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	745
				746	spin_lock(&cache->lock);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	747	if (cache->cached != BTRFS_CACHE_NO) {
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	748	kfree(caching_ctl);
Josef Bacik	e747853	2020-10-23 09:58:10 -0400	[diff] [blame]	749
				750	caching_ctl = cache->caching_ctl;
				751	if (caching_ctl)
				752	refcount_inc(&caching_ctl->count);
				753	spin_unlock(&cache->lock);
				754	goto out;
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	755	}
				756	WARN_ON(cache->caching_ctl);
				757	cache->caching_ctl = caching_ctl;
Josef Bacik	e747853	2020-10-23 09:58:10 -0400	[diff] [blame]	758	if (btrfs_test_opt(fs_info, SPACE_CACHE))
				759	cache->cached = BTRFS_CACHE_FAST;
				760	else
				761	cache->cached = BTRFS_CACHE_STARTED;
				762	cache->has_caching_ctl = 1;
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	763	spin_unlock(&cache->lock);
				764
Josef Bacik	bbb86a3	2020-10-23 09:58:11 -0400	[diff] [blame]	765	spin_lock(&fs_info->block_group_cache_lock);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	766	refcount_inc(&caching_ctl->count);
				767	list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
Josef Bacik	bbb86a3	2020-10-23 09:58:11 -0400	[diff] [blame]	768	spin_unlock(&fs_info->block_group_cache_lock);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	769
				770	btrfs_get_block_group(cache);
				771
				772	btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
Josef Bacik	e747853	2020-10-23 09:58:10 -0400	[diff] [blame]	773	out:
				774	if (load_cache_only && caching_ctl)
				775	btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
				776	if (caching_ctl)
				777	btrfs_put_caching_control(caching_ctl);
Josef Bacik	9f21246	2019-08-06 16:43:19 +0200	[diff] [blame]	778
				779	return ret;
				780	}
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	781
				782	static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
				783	{
				784	u64 extra_flags = chunk_to_extended(flags) &
				785	BTRFS_EXTENDED_PROFILE_MASK;
				786
				787	write_seqlock(&fs_info->profiles_lock);
				788	if (flags & BTRFS_BLOCK_GROUP_DATA)
				789	fs_info->avail_data_alloc_bits &= ~extra_flags;
				790	if (flags & BTRFS_BLOCK_GROUP_METADATA)
				791	fs_info->avail_metadata_alloc_bits &= ~extra_flags;
				792	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				793	fs_info->avail_system_alloc_bits &= ~extra_flags;
				794	write_sequnlock(&fs_info->profiles_lock);
				795	}
				796
				797	/*
				798	* Clear incompat bits for the following feature(s):
				799	*
				800	* - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
				801	* in the whole filesystem
David Sterba	9c90744	2019-10-31 15:52:01 +0100	[diff] [blame]	802	*
				803	* - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	804	*/
				805	static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
				806	{
David Sterba	9c90744	2019-10-31 15:52:01 +0100	[diff] [blame]	807	bool found_raid56 = false;
				808	bool found_raid1c34 = false;
				809
				810	if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) \|\|
				811	(flags & BTRFS_BLOCK_GROUP_RAID1C3) \|\|
				812	(flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	813	struct list_head *head = &fs_info->space_info;
				814	struct btrfs_space_info *sinfo;
				815
				816	list_for_each_entry_rcu(sinfo, head, list) {
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	817	down_read(&sinfo->groups_sem);
				818	if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
David Sterba	9c90744	2019-10-31 15:52:01 +0100	[diff] [blame]	819	found_raid56 = true;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	820	if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
David Sterba	9c90744	2019-10-31 15:52:01 +0100	[diff] [blame]	821	found_raid56 = true;
				822	if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
				823	found_raid1c34 = true;
				824	if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
				825	found_raid1c34 = true;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	826	up_read(&sinfo->groups_sem);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	827	}
Filipe Manana	d8e6fd5	2020-03-20 18:43:48 +0000	[diff] [blame]	828	if (!found_raid56)
David Sterba	9c90744	2019-10-31 15:52:01 +0100	[diff] [blame]	829	btrfs_clear_fs_incompat(fs_info, RAID56);
Filipe Manana	d8e6fd5	2020-03-20 18:43:48 +0000	[diff] [blame]	830	if (!found_raid1c34)
David Sterba	9c90744	2019-10-31 15:52:01 +0100	[diff] [blame]	831	btrfs_clear_fs_incompat(fs_info, RAID1C34);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	832	}
				833	}
				834
Qu Wenruo	7357623	2020-05-05 07:58:21 +0800	[diff] [blame]	835	static int remove_block_group_item(struct btrfs_trans_handle *trans,
				836	struct btrfs_path *path,
				837	struct btrfs_block_group *block_group)
				838	{
				839	struct btrfs_fs_info *fs_info = trans->fs_info;
				840	struct btrfs_root *root;
				841	struct btrfs_key key;
				842	int ret;
				843
				844	root = fs_info->extent_root;
				845	key.objectid = block_group->start;
				846	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
				847	key.offset = block_group->length;
				848
				849	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				850	if (ret > 0)
				851	ret = -ENOENT;
				852	if (ret < 0)
				853	return ret;
				854
				855	ret = btrfs_del_item(trans, root, path);
				856	return ret;
				857	}
				858
/*
 * Remove the block group starting at @group_start from the filesystem's
 * in-memory structures and delete its items from the trees.
 *
 * @trans:       transaction handle the removal is performed under
 * @group_start: logical start of the block group; it must exist and already
 *               be read-only (both conditions are enforced with BUG_ON())
 * @em:          extent map of the chunk; it is removed from
 *               fs_info->mapping_tree here only if the block group is not
 *               frozen at that point
 *
 * Returns 0 on success, -ENOMEM if no path could be allocated, or the error
 * returned by one of the removal helpers. The reference taken by the lookup
 * is dropped on every exit path.
 */
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	859	int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
				860	u64 group_start, struct extent_map *em)
				861	{
				862	struct btrfs_fs_info *fs_info = trans->fs_info;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	863	struct btrfs_path *path;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	864	struct btrfs_block_group *block_group;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	865	struct btrfs_free_cluster *cluster;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	866	struct inode *inode;
				867	struct kobject *kobj = NULL;
				868	int ret;
				869	int index;
				870	int factor;
				871	struct btrfs_caching_control *caching_ctl = NULL;
				872	bool remove_em;
				873	bool remove_rsv = false;
				874
				875	block_group = btrfs_lookup_block_group(fs_info, group_start);
				876	BUG_ON(!block_group);
				877	BUG_ON(!block_group->ro);
				878
				879	trace_btrfs_remove_block_group(block_group);
				880	/*
				881	* Free the reserved super bytes from this block group before
				882	* remove it.
				883	*/
				884	btrfs_free_excluded_extents(block_group);
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	885	btrfs_free_ref_tree_range(fs_info, block_group->start,
				886	block_group->length);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	887
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	888	index = btrfs_bg_flags_to_raid_index(block_group->flags);
				889	factor = btrfs_bg_type_to_factor(block_group->flags);
				890
				891	/* make sure this block group isn't part of an allocation cluster */
				892	cluster = &fs_info->data_alloc_cluster;
				893	spin_lock(&cluster->refill_lock);
				894	btrfs_return_cluster_to_free_space(block_group, cluster);
				895	spin_unlock(&cluster->refill_lock);
				896
				897	/*
				898	* make sure this block group isn't part of a metadata
				899	* allocation cluster
				900	*/
				901	cluster = &fs_info->meta_alloc_cluster;
				902	spin_lock(&cluster->refill_lock);
				903	btrfs_return_cluster_to_free_space(block_group, cluster);
				904	spin_unlock(&cluster->refill_lock);
				905
Naohiro Aota	40ab3be	2021-02-04 19:22:18 +0900	[diff] [blame]	906	btrfs_clear_treelog_bg(block_group);
Johannes Thumshirn	c2707a2	2021-09-09 01:19:26 +0900	[diff] [blame]	907	btrfs_clear_data_reloc_bg(block_group);
Naohiro Aota	40ab3be	2021-02-04 19:22:18 +0900	[diff] [blame]	908
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	909	path = btrfs_alloc_path();
				910	if (!path) {
				911	ret = -ENOMEM;
Filipe Manana	9fecd13	2020-06-01 19:12:06 +0100	[diff] [blame]	912	goto out;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	913	}
				914
				915	/*
				916	* get the inode first so any iput calls done for the io_list
				917	* aren't the final iput (no unlinks allowed now)
				918	*/
				919	inode = lookup_free_space_inode(block_group, path);
				920
				921	mutex_lock(&trans->transaction->cache_write_mutex);
				922	/*
				923	* Make sure our free space cache IO is done before removing the
				924	* free space inode
				925	*/
				926	spin_lock(&trans->transaction->dirty_bgs_lock);
				927	if (!list_empty(&block_group->io_list)) {
				928	list_del_init(&block_group->io_list);
				929
				930	WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
				931
/* dirty_bgs_lock is dropped around the wait for the cache IO and retaken after it */
				932	spin_unlock(&trans->transaction->dirty_bgs_lock);
				933	btrfs_wait_cache_io(trans, block_group, path);
				934	btrfs_put_block_group(block_group);
				935	spin_lock(&trans->transaction->dirty_bgs_lock);
				936	}
				937
				938	if (!list_empty(&block_group->dirty_list)) {
				939	list_del_init(&block_group->dirty_list);
/* Remembered so that the 'out' path releases one delayed refs rsv unit */
				940	remove_rsv = true;
				941	btrfs_put_block_group(block_group);
				942	}
				943	spin_unlock(&trans->transaction->dirty_bgs_lock);
				944	mutex_unlock(&trans->transaction->cache_write_mutex);
				945
Boris Burkov	36b216c	2020-11-18 15:06:25 -0800	[diff] [blame]	946	ret = btrfs_remove_free_space_inode(trans, inode, block_group);
				947	if (ret)
Filipe Manana	9fecd13	2020-06-01 19:12:06 +0100	[diff] [blame]	948	goto out;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	949
				950	spin_lock(&fs_info->block_group_cache_lock);
				951	rb_erase(&block_group->cache_node,
				952	&fs_info->block_group_cache_tree);
				953	RB_CLEAR_NODE(&block_group->cache_node);
				954
Filipe Manana	9fecd13	2020-06-01 19:12:06 +0100	[diff] [blame]	955	/* Once for the block groups rbtree */
				956	btrfs_put_block_group(block_group);
				957
/* Reset first_logical_byte if it pointed at the block group being removed */
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	958	if (fs_info->first_logical_byte == block_group->start)
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	959	fs_info->first_logical_byte = (u64)-1;
				960	spin_unlock(&fs_info->block_group_cache_lock);
				961
				962	down_write(&block_group->space_info->groups_sem);
				963	/*
				964	* we must use list_del_init so people can check to see if they
				965	* are still on the list after taking the semaphore
				966	*/
				967	list_del_init(&block_group->list);
/* Last block group of this raid index: detach its kobject and clear the avail bits */
				968	if (list_empty(&block_group->space_info->block_groups[index])) {
				969	kobj = block_group->space_info->block_group_kobjs[index];
				970	block_group->space_info->block_group_kobjs[index] = NULL;
				971	clear_avail_alloc_bits(fs_info, block_group->flags);
				972	}
				973	up_write(&block_group->space_info->groups_sem);
				974	clear_incompat_bg_bits(fs_info, block_group->flags);
/* The detached kobject is deleted and put only after groups_sem was released */
				975	if (kobj) {
				976	kobject_del(kobj);
				977	kobject_put(kobj);
				978	}
				979
				980	if (block_group->has_caching_ctl)
				981	caching_ctl = btrfs_get_caching_control(block_group);
				982	if (block_group->cached == BTRFS_CACHE_STARTED)
				983	btrfs_wait_block_group_cache_done(block_group);
				984	if (block_group->has_caching_ctl) {
Josef Bacik	bbb86a3	2020-10-23 09:58:11 -0400	[diff] [blame]	985	spin_lock(&fs_info->block_group_cache_lock);
/* No control obtained above: search the caching_block_groups list for ours */
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	986	if (!caching_ctl) {
				987	struct btrfs_caching_control *ctl;
				988
				989	list_for_each_entry(ctl,
				990	&fs_info->caching_block_groups, list)
				991	if (ctl->block_group == block_group) {
				992	caching_ctl = ctl;
				993	refcount_inc(&caching_ctl->count);
				994	break;
				995	}
				996	}
				997	if (caching_ctl)
				998	list_del_init(&caching_ctl->list);
Josef Bacik	bbb86a3	2020-10-23 09:58:11 -0400	[diff] [blame]	999	spin_unlock(&fs_info->block_group_cache_lock);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1000	if (caching_ctl) {
				1001	/* Once for the caching bgs list and once for us. */
				1002	btrfs_put_caching_control(caching_ctl);
				1003	btrfs_put_caching_control(caching_ctl);
				1004	}
				1005	}
				1006
				1007	spin_lock(&trans->transaction->dirty_bgs_lock);
				1008	WARN_ON(!list_empty(&block_group->dirty_list));
				1009	WARN_ON(!list_empty(&block_group->io_list));
				1010	spin_unlock(&trans->transaction->dirty_bgs_lock);
				1011
				1012	btrfs_remove_free_space_cache(block_group);
				1013
				1014	spin_lock(&block_group->space_info->lock);
				1015	list_del_init(&block_group->ro_list);
				1016
/* With ENOSPC_DEBUG, warn if any counter is smaller than what is subtracted below */
				1017	if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
				1018	WARN_ON(block_group->space_info->total_bytes
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1019	< block_group->length);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1020	WARN_ON(block_group->space_info->bytes_readonly
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	1021	< block_group->length - block_group->zone_unusable);
				1022	WARN_ON(block_group->space_info->bytes_zone_unusable
				1023	< block_group->zone_unusable);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1024	WARN_ON(block_group->space_info->disk_total
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1025	< block_group->length * factor);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1026	}
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1027	block_group->space_info->total_bytes -= block_group->length;
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	1028	block_group->space_info->bytes_readonly -=
				1029	(block_group->length - block_group->zone_unusable);
				1030	block_group->space_info->bytes_zone_unusable -=
				1031	block_group->zone_unusable;
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1032	block_group->space_info->disk_total -= block_group->length * factor;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1033
				1034	spin_unlock(&block_group->space_info->lock);
				1035
Filipe Manana	ffcb9d4	2020-06-01 19:12:19 +0100	[diff] [blame]	1036	/*
				1037	* Remove the free space for the block group from the free space tree
				1038	* and the block group's item from the extent tree before marking the
				1039	* block group as removed. This is to prevent races with tasks that
				1040	* freeze and unfreeze a block group, this task and another task
				1041	* allocating a new block group - the unfreeze task ends up removing
				1042	* the block group's extent map before the task calling this function
				1043	* deletes the block group item from the extent tree, allowing for
				1044	* another task to attempt to create another block group with the same
				1045	* item key (and failing with -EEXIST and a transaction abort).
				1046	*/
				1047	ret = remove_block_group_free_space(trans, block_group);
				1048	if (ret)
				1049	goto out;
				1050
				1051	ret = remove_block_group_item(trans, path, block_group);
				1052	if (ret < 0)
				1053	goto out;
				1054
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1055	spin_lock(&block_group->lock);
				1056	block_group->removed = 1;
				1057	/*
Filipe Manana	6b7304a	2020-05-08 11:01:47 +0100	[diff] [blame]	1058	* At this point trimming or scrub can't start on this block group,
				1059	* because we removed the block group from the rbtree
				1060	* fs_info->block_group_cache_tree so no one can find it anymore and
				1061	* even if someone already got this block group before we removed it
				1062	* from the rbtree, they have already incremented block_group->frozen -
				1063	* if they didn't, for the trimming case they won't find any free space
				1064	* entries because we already removed them all when we called
				1065	* btrfs_remove_free_space_cache().
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1066	*
				1067	* And we must not remove the extent map from the fs_info->mapping_tree
				1068	* to prevent the same logical address range and physical device space
Filipe Manana	6b7304a	2020-05-08 11:01:47 +0100	[diff] [blame]	1069	* ranges from being reused for a new block group. This is needed to
				1070	* avoid races with trimming and scrub.
				1071	*
				1072	* An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1073	* completely transactionless, so while it is trimming a range the
				1074	* currently running transaction might finish and a new one start,
				1075	* allowing for new block groups to be created that can reuse the same
				1076	* physical device locations unless we take this special care.
				1077	*
				1078	* There may also be an implicit trim operation if the file system
				1079	* is mounted with -odiscard. The same protections must remain
				1080	* in place until the extents have been discarded completely when
				1081	* the transaction commit has completed.
				1082	*/
Filipe Manana	6b7304a	2020-05-08 11:01:47 +0100	[diff] [blame]	1083	remove_em = (atomic_read(&block_group->frozen) == 0);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1084	spin_unlock(&block_group->lock);
				1085
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1086	if (remove_em) {
				1087	struct extent_map_tree *em_tree;
				1088
				1089	em_tree = &fs_info->mapping_tree;
				1090	write_lock(&em_tree->lock);
				1091	remove_extent_mapping(em_tree, em);
				1092	write_unlock(&em_tree->lock);
				1093	/* once for the tree */
				1094	free_extent_map(em);
				1095	}
Xiyu Yang	f6033c5	2020-04-21 10:54:11 +0800	[diff] [blame]	1096
Filipe Manana	9fecd13	2020-06-01 19:12:06 +0100	[diff] [blame]	1097	out:
Xiyu Yang	f6033c5	2020-04-21 10:54:11 +0800	[diff] [blame]	1098	/* Once for the lookup reference */
				1099	btrfs_put_block_group(block_group);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1100	if (remove_rsv)
				1101	btrfs_delayed_refs_rsv_release(fs_info, 1);
				1102	btrfs_free_path(path);
				1103	return ret;
				1104	}
				1105
				1106	struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
				1107	struct btrfs_fs_info *fs_info, const u64 chunk_offset)
				1108	{
				1109	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
				1110	struct extent_map *em;
				1111	struct map_lookup *map;
				1112	unsigned int num_items;
				1113
				1114	read_lock(&em_tree->lock);
				1115	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
				1116	read_unlock(&em_tree->lock);
				1117	ASSERT(em && em->start == chunk_offset);
				1118
				1119	/*
				1120	* We need to reserve 3 + N units from the metadata space info in order
				1121	* to remove a block group (done at btrfs_remove_chunk() and at
				1122	* btrfs_remove_block_group()), which are used for:
				1123	*
				1124	* 1 unit for adding the free space inode's orphan (located in the tree
				1125	* of tree roots).
				1126	* 1 unit for deleting the block group item (located in the extent
				1127	* tree).
				1128	* 1 unit for deleting the free space item (located in tree of tree
				1129	* roots).
				1130	* N units for deleting N device extent items corresponding to each
				1131	* stripe (located in the device tree).
				1132	*
				1133	* In order to remove a block group we also need to reserve units in the
				1134	* system space info in order to update the chunk tree (update one or
				1135	* more device items and remove one chunk item), but this is done at
				1136	* btrfs_remove_chunk() through a call to check_system_chunk().
				1137	*/
				1138	map = em->map_lookup;
				1139	num_items = 3 + map->num_stripes;
				1140	free_extent_map(em);
				1141
				1142	return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
Josef Bacik	7f9fe61	2020-03-13 15:58:05 -0400	[diff] [blame]	1143	num_items);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1144	}
				1145
				1146	/*
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1147	* Mark block group @cache read-only, so later write won't happen to block
				1148	* group @cache.
				1149	*
				1150	* If @force is not set, this function will only mark the block group readonly
				1151	* if we have enough free space (1M) in other metadata/system block groups.
				1152	* If @force is not set, this function will mark the block group readonly
				1153	* without checking free space.
				1154	*
				1155	* NOTE: This function doesn't care if other block groups can contain all the
				1156	* data in this block group. That check should be done by relocation routine,
				1157	* not this function.
				1158	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1159	static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1160	{
				1161	struct btrfs_space_info *sinfo = cache->space_info;
				1162	u64 num_bytes;
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1163	int ret = -ENOSPC;
				1164
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1165	spin_lock(&sinfo->lock);
				1166	spin_lock(&cache->lock);
				1167
Filipe Manana	195a49e	2021-02-05 12:55:37 +0000	[diff] [blame]	1168	if (cache->swap_extents) {
				1169	ret = -ETXTBSY;
				1170	goto out;
				1171	}
				1172
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1173	if (cache->ro) {
				1174	cache->ro++;
				1175	ret = 0;
				1176	goto out;
				1177	}
				1178
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1179	num_bytes = cache->length - cache->reserved - cache->pinned -
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	1180	cache->bytes_super - cache->zone_unusable - cache->used;
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1181
				1182	/*
Josef Bacik	a30a3d2	2020-01-17 09:07:39 -0500	[diff] [blame]	1183	* Data never overcommits, even in mixed mode, so do just the straight
				1184	* check of left over space in how much we have allocated.
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1185	*/
Josef Bacik	a30a3d2	2020-01-17 09:07:39 -0500	[diff] [blame]	1186	if (force) {
				1187	ret = 0;
				1188	} else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
				1189	u64 sinfo_used = btrfs_space_info_used(sinfo, true);
				1190
				1191	/*
				1192	* Here we make sure if we mark this bg RO, we still have enough
				1193	* free space as buffer.
				1194	*/
				1195	if (sinfo_used + num_bytes <= sinfo->total_bytes)
				1196	ret = 0;
				1197	} else {
				1198	/*
				1199	* We overcommit metadata, so we need to do the
				1200	* btrfs_can_overcommit check here, and we need to pass in
				1201	* BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
				1202	* leeway to allow us to mark this block group as read only.
				1203	*/
				1204	if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
				1205	BTRFS_RESERVE_NO_FLUSH))
				1206	ret = 0;
				1207	}
				1208
				1209	if (!ret) {
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1210	sinfo->bytes_readonly += num_bytes;
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	1211	if (btrfs_is_zoned(cache->fs_info)) {
				1212	/* Migrate zone_unusable bytes to readonly */
				1213	sinfo->bytes_readonly += cache->zone_unusable;
				1214	sinfo->bytes_zone_unusable -= cache->zone_unusable;
				1215	cache->zone_unusable = 0;
				1216	}
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1217	cache->ro++;
				1218	list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1219	}
				1220	out:
				1221	spin_unlock(&cache->lock);
				1222	spin_unlock(&sinfo->lock);
				1223	if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
				1224	btrfs_info(cache->fs_info,
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1225	"unable to make block group %llu ro", cache->start);
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1226	btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
				1227	}
				1228	return ret;
				1229	}
				1230
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	1231	static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
				1232	struct btrfs_block_group *bg)
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1233	{
				1234	struct btrfs_fs_info *fs_info = bg->fs_info;
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	1235	struct btrfs_transaction *prev_trans = NULL;
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1236	const u64 start = bg->start;
				1237	const u64 end = start + bg->length - 1;
				1238	int ret;
				1239
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	1240	spin_lock(&fs_info->trans_lock);
				1241	if (trans->transaction->list.prev != &fs_info->trans_list) {
				1242	prev_trans = list_last_entry(&trans->transaction->list,
				1243	struct btrfs_transaction, list);
				1244	refcount_inc(&prev_trans->use_count);
				1245	}
				1246	spin_unlock(&fs_info->trans_lock);
				1247
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1248	/*
				1249	* Hold the unused_bg_unpin_mutex lock to avoid racing with
				1250	* btrfs_finish_extent_commit(). If we are at transaction N, another
				1251	* task might be running finish_extent_commit() for the previous
				1252	* transaction N - 1, and have seen a range belonging to the block
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	1253	* group in pinned_extents before we were able to clear the whole block
				1254	* group range from pinned_extents. This means that task can lookup for
				1255	* the block group after we unpinned it from pinned_extents and removed
				1256	* it, leading to a BUG_ON() at unpin_extent_range().
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1257	*/
				1258	mutex_lock(&fs_info->unused_bg_unpin_mutex);
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	1259	if (prev_trans) {
				1260	ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
				1261	EXTENT_DIRTY);
				1262	if (ret)
Filipe Manana	534cf53	2020-04-17 16:36:50 +0100	[diff] [blame]	1263	goto out;
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	1264	}
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1265
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	1266	ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1267	EXTENT_DIRTY);
Filipe Manana	534cf53	2020-04-17 16:36:50 +0100	[diff] [blame]	1268	out:
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1269	mutex_unlock(&fs_info->unused_bg_unpin_mutex);
Filipe Manana	5150bf1	2020-04-17 16:36:15 +0100	[diff] [blame]	1270	if (prev_trans)
				1271	btrfs_put_transaction(prev_trans);
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1272
Filipe Manana	534cf53	2020-04-17 16:36:50 +0100	[diff] [blame]	1273	return ret == 0;
Nikolay Borisov	45bb5d6	2020-01-20 16:09:17 +0200	[diff] [blame]	1274	}
				1275
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	1276	/*
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1277	* Process the unused_bgs list and remove any that don't have any allocated
				1278	* space inside of them.
				1279	*/
				1280	void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
				1281	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1282	struct btrfs_block_group *block_group;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1283	struct btrfs_space_info *space_info;
				1284	struct btrfs_trans_handle *trans;
Dennis Zhou	6e80d4f	2019-12-13 16:22:15 -0800	[diff] [blame]	1285	const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1286	int ret = 0;
				1287
				1288	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
				1289	return;
				1290
Josef Bacik	ddfd08c	2020-12-18 14:24:19 -0500	[diff] [blame]	1291	/*
				1292	* Long running balances can keep us blocked here for eternity, so
				1293	* simply skip deletion if we're unable to get the mutex.
				1294	*/
Johannes Thumshirn	f337206	2021-04-19 16:41:01 +0900	[diff] [blame]	1295	if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
Josef Bacik	ddfd08c	2020-12-18 14:24:19 -0500	[diff] [blame]	1296	return;
				1297
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1298	spin_lock(&fs_info->unused_bgs_lock);
				1299	while (!list_empty(&fs_info->unused_bgs)) {
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1300	int trimming;
				1301
				1302	block_group = list_first_entry(&fs_info->unused_bgs,
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1303	struct btrfs_block_group,
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1304	bg_list);
				1305	list_del_init(&block_group->bg_list);
				1306
				1307	space_info = block_group->space_info;
				1308
				1309	if (ret \|\| btrfs_mixed_space_info(space_info)) {
				1310	btrfs_put_block_group(block_group);
				1311	continue;
				1312	}
				1313	spin_unlock(&fs_info->unused_bgs_lock);
				1314
Dennis Zhou	b0643e5	2019-12-13 16:22:14 -0800	[diff] [blame]	1315	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
				1316
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1317	/* Don't want to race with allocators so take the groups_sem */
				1318	down_write(&space_info->groups_sem);
Dennis Zhou	6e80d4f	2019-12-13 16:22:15 -0800	[diff] [blame]	1319
				1320	/*
				1321	* Async discard moves the final block group discard to be prior
				1322	* to the unused_bgs code path. Therefore, if it's not fully
				1323	* trimmed, punt it back to the async discard lists.
				1324	*/
				1325	if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
				1326	!btrfs_is_free_space_trimmed(block_group)) {
				1327	trace_btrfs_skip_unused_block_group(block_group);
				1328	up_write(&space_info->groups_sem);
				1329	/* Requeue if we failed because of async discard */
				1330	btrfs_discard_queue_work(&fs_info->discard_ctl,
				1331	block_group);
				1332	goto next;
				1333	}
				1334
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1335	spin_lock(&block_group->lock);
				1336	if (block_group->reserved \|\| block_group->pinned \|\|
David Sterba	bf38be6	2019-10-23 18:48:11 +0200	[diff] [blame]	1337	block_group->used \|\| block_group->ro \|\|
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1338	list_is_singular(&block_group->list)) {
				1339	/*
				1340	* We want to bail if we made new allocations or have
				1341	* outstanding allocations in this block group. We do
				1342	* the ro check in case balance is currently acting on
				1343	* this block group.
				1344	*/
				1345	trace_btrfs_skip_unused_block_group(block_group);
				1346	spin_unlock(&block_group->lock);
				1347	up_write(&space_info->groups_sem);
				1348	goto next;
				1349	}
				1350	spin_unlock(&block_group->lock);
				1351
				1352	/* We don't want to force the issue, only flip if it's ok. */
Josef Bacik	e11c040	2019-06-20 15:38:07 -0400	[diff] [blame]	1353	ret = inc_block_group_ro(block_group, 0);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1354	up_write(&space_info->groups_sem);
				1355	if (ret < 0) {
				1356	ret = 0;
				1357	goto next;
				1358	}
				1359
				1360	/*
				1361	* Want to do this before we do anything else so we can recover
				1362	* properly if we fail to join the transaction.
				1363	*/
				1364	trans = btrfs_start_trans_remove_block_group(fs_info,
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1365	block_group->start);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1366	if (IS_ERR(trans)) {
				1367	btrfs_dec_block_group_ro(block_group);
				1368	ret = PTR_ERR(trans);
				1369	goto next;
				1370	}
				1371
				1372	/*
				1373	* We could have pending pinned extents for this block group,
				1374	* just delete them, we don't care about them anymore.
				1375	*/
Filipe Manana	534cf53	2020-04-17 16:36:50 +0100	[diff] [blame]	1376	if (!clean_pinned_extents(trans, block_group)) {
				1377	btrfs_dec_block_group_ro(block_group);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1378	goto end_trans;
Filipe Manana	534cf53	2020-04-17 16:36:50 +0100	[diff] [blame]	1379	}
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1380
Dennis Zhou	b0643e5	2019-12-13 16:22:14 -0800	[diff] [blame]	1381	/*
				1382	* At this point, the block_group is read only and should fail
				1383	* new allocations. However, btrfs_finish_extent_commit() can
				1384	* cause this block_group to be placed back on the discard
				1385	* lists because now the block_group isn't fully discarded.
				1386	* Bail here and try again later after discarding everything.
				1387	*/
				1388	spin_lock(&fs_info->discard_ctl.lock);
				1389	if (!list_empty(&block_group->discard_list)) {
				1390	spin_unlock(&fs_info->discard_ctl.lock);
				1391	btrfs_dec_block_group_ro(block_group);
				1392	btrfs_discard_queue_work(&fs_info->discard_ctl,
				1393	block_group);
				1394	goto end_trans;
				1395	}
				1396	spin_unlock(&fs_info->discard_ctl.lock);
				1397
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1398	/* Reset pinned so btrfs_put_block_group doesn't complain */
				1399	spin_lock(&space_info->lock);
				1400	spin_lock(&block_group->lock);
				1401
				1402	btrfs_space_info_update_bytes_pinned(fs_info, space_info,
				1403	-block_group->pinned);
				1404	space_info->bytes_readonly += block_group->pinned;
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1405	block_group->pinned = 0;
				1406
				1407	spin_unlock(&block_group->lock);
				1408	spin_unlock(&space_info->lock);
				1409
Dennis Zhou	6e80d4f	2019-12-13 16:22:15 -0800	[diff] [blame]	1410	/*
				1411	* The normal path here is an unused block group is passed here,
				1412	* then trimming is handled in the transaction commit path.
				1413	* Async discard interposes before this to do the trimming
				1414	* before coming down the unused block group path as trimming
				1415	* will no longer be done later in the transaction commit path.
				1416	*/
				1417	if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
				1418	goto flip_async;
				1419
Naohiro Aota	dcba6e4	2021-02-04 19:21:56 +0900	[diff] [blame]	1420	/*
				1421	* DISCARD can flip during remount. On zoned filesystems, we
				1422	* need to reset sequential-required zones.
				1423	*/
				1424	trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) \|\|
				1425	btrfs_is_zoned(fs_info);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1426
				1427	/* Implicit trim during transaction commit. */
				1428	if (trimming)
Filipe Manana	6b7304a	2020-05-08 11:01:47 +0100	[diff] [blame]	1429	btrfs_freeze_block_group(block_group);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1430
				1431	/*
				1432	* Btrfs_remove_chunk will abort the transaction if things go
				1433	* horribly wrong.
				1434	*/
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1435	ret = btrfs_remove_chunk(trans, block_group->start);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1436
				1437	if (ret) {
				1438	if (trimming)
Filipe Manana	6b7304a	2020-05-08 11:01:47 +0100	[diff] [blame]	1439	btrfs_unfreeze_block_group(block_group);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1440	goto end_trans;
				1441	}
				1442
				1443	/*
				1444	* If we're not mounted with -odiscard, we can just forget
				1445	* about this block group. Otherwise we'll need to wait
				1446	* until transaction commit to do the actual discard.
				1447	*/
				1448	if (trimming) {
				1449	spin_lock(&fs_info->unused_bgs_lock);
				1450	/*
				1451	* A concurrent scrub might have added us to the list
				1452	* fs_info->unused_bgs, so use a list_move operation
				1453	* to add the block group to the deleted_bgs list.
				1454	*/
				1455	list_move(&block_group->bg_list,
				1456	&trans->transaction->deleted_bgs);
				1457	spin_unlock(&fs_info->unused_bgs_lock);
				1458	btrfs_get_block_group(block_group);
				1459	}
				1460	end_trans:
				1461	btrfs_end_transaction(trans);
				1462	next:
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1463	btrfs_put_block_group(block_group);
				1464	spin_lock(&fs_info->unused_bgs_lock);
				1465	}
				1466	spin_unlock(&fs_info->unused_bgs_lock);
Johannes Thumshirn	f337206	2021-04-19 16:41:01 +0900	[diff] [blame]	1467	mutex_unlock(&fs_info->reclaim_bgs_lock);
Dennis Zhou	6e80d4f	2019-12-13 16:22:15 -0800	[diff] [blame]	1468	return;
				1469
				1470	flip_async:
				1471	btrfs_end_transaction(trans);
Johannes Thumshirn	f337206	2021-04-19 16:41:01 +0900	[diff] [blame]	1472	mutex_unlock(&fs_info->reclaim_bgs_lock);
Dennis Zhou	6e80d4f	2019-12-13 16:22:15 -0800	[diff] [blame]	1473	btrfs_put_block_group(block_group);
				1474	btrfs_discard_punt_unused_bgs_list(fs_info);
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1475	}
				1476
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1477	void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
Josef Bacik	e3e0520	2019-06-20 15:37:55 -0400	[diff] [blame]	1478	{
				1479	struct btrfs_fs_info *fs_info = bg->fs_info;
				1480
				1481	spin_lock(&fs_info->unused_bgs_lock);
				1482	if (list_empty(&bg->bg_list)) {
				1483	btrfs_get_block_group(bg);
				1484	trace_btrfs_add_unused_block_group(bg);
				1485	list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
				1486	}
				1487	spin_unlock(&fs_info->unused_bgs_lock);
				1488	}
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1489
Johannes Thumshirn	2ca0ec7	2021-10-14 18:39:02 +0900	[diff] [blame]	1490	/*
				1491	* We want block groups with a low number of used bytes to be in the beginning
				1492	* of the list, so they will get reclaimed first.
				1493	*/
				1494	static int reclaim_bgs_cmp(void unused, const struct list_head a,
				1495	const struct list_head *b)
				1496	{
				1497	const struct btrfs_block_group bg1, bg2;
				1498
				1499	bg1 = list_entry(a, struct btrfs_block_group, bg_list);
				1500	bg2 = list_entry(b, struct btrfs_block_group, bg_list);
				1501
				1502	return bg1->used > bg2->used;
				1503	}
				1504
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1505	void btrfs_reclaim_bgs_work(struct work_struct *work)
				1506	{
				1507	struct btrfs_fs_info *fs_info =
				1508	container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
				1509	struct btrfs_block_group *bg;
				1510	struct btrfs_space_info *space_info;
Filipe Manana	1cea5cf	2021-06-21 11:10:38 +0100	[diff] [blame]	1511	LIST_HEAD(again_list);
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1512
				1513	if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
				1514	return;
				1515
				1516	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
				1517	return;
				1518
Johannes Thumshirn	9cc0b83	2021-07-06 01:32:38 +0900	[diff] [blame]	1519	/*
				1520	* Long running balances can keep us blocked here for eternity, so
				1521	* simply skip reclaim if we're unable to get the mutex.
				1522	*/
				1523	if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
				1524	btrfs_exclop_finish(fs_info);
				1525	return;
				1526	}
				1527
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1528	spin_lock(&fs_info->unused_bgs_lock);
Johannes Thumshirn	2ca0ec7	2021-10-14 18:39:02 +0900	[diff] [blame]	1529	/*
				1530	* Sort happens under lock because we can't simply splice it and sort.
				1531	* The block groups might still be in use and reachable via bg_list,
				1532	* and their presence in the reclaim_bgs list must be preserved.
				1533	*/
				1534	list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1535	while (!list_empty(&fs_info->reclaim_bgs)) {
Johannes Thumshirn	5f93e77	2021-06-29 03:16:46 +0900	[diff] [blame]	1536	u64 zone_unusable;
Filipe Manana	1cea5cf	2021-06-21 11:10:38 +0100	[diff] [blame]	1537	int ret = 0;
				1538
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1539	bg = list_first_entry(&fs_info->reclaim_bgs,
				1540	struct btrfs_block_group,
				1541	bg_list);
				1542	list_del_init(&bg->bg_list);
				1543
				1544	space_info = bg->space_info;
				1545	spin_unlock(&fs_info->unused_bgs_lock);
				1546
				1547	/* Don't race with allocators so take the groups_sem */
				1548	down_write(&space_info->groups_sem);
				1549
				1550	spin_lock(&bg->lock);
				1551	if (bg->reserved \|\| bg->pinned \|\| bg->ro) {
				1552	/*
				1553	* We want to bail if we made new allocations or have
				1554	* outstanding allocations in this block group. We do
				1555	* the ro check in case balance is currently acting on
				1556	* this block group.
				1557	*/
				1558	spin_unlock(&bg->lock);
				1559	up_write(&space_info->groups_sem);
				1560	goto next;
				1561	}
				1562	spin_unlock(&bg->lock);
				1563
				1564	/* Get out fast, in case we're unmounting the filesystem */
				1565	if (btrfs_fs_closing(fs_info)) {
				1566	up_write(&space_info->groups_sem);
				1567	goto next;
				1568	}
				1569
Johannes Thumshirn	5f93e77	2021-06-29 03:16:46 +0900	[diff] [blame]	1570	/*
				1571	* Cache the zone_unusable value before turning the block group
				1572	* to read only. As soon as the blog group is read only it's
				1573	* zone_unusable value gets moved to the block group's read-only
				1574	* bytes and isn't available for calculations anymore.
				1575	*/
				1576	zone_unusable = bg->zone_unusable;
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1577	ret = inc_block_group_ro(bg, 0);
				1578	up_write(&space_info->groups_sem);
				1579	if (ret < 0)
				1580	goto next;
				1581
Johannes Thumshirn	5f93e77	2021-06-29 03:16:46 +0900	[diff] [blame]	1582	btrfs_info(fs_info,
				1583	"reclaiming chunk %llu with %llu%% used %llu%% unusable",
				1584	bg->start, div_u64(bg->used * 100, bg->length),
				1585	div64_u64(zone_unusable * 100, bg->length));
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1586	trace_btrfs_reclaim_block_group(bg);
				1587	ret = btrfs_relocate_chunk(fs_info, bg->start);
Naohiro Aota	ba86dd9	2021-08-09 13:32:30 +0900	[diff] [blame]	1588	if (ret && ret != -EAGAIN)
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1589	btrfs_err(fs_info, "error relocating chunk %llu",
				1590	bg->start);
				1591
				1592	next:
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1593	spin_lock(&fs_info->unused_bgs_lock);
Filipe Manana	1cea5cf	2021-06-21 11:10:38 +0100	[diff] [blame]	1594	if (ret == -EAGAIN && list_empty(&bg->bg_list))
				1595	list_add_tail(&bg->bg_list, &again_list);
				1596	else
				1597	btrfs_put_block_group(bg);
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1598	}
Filipe Manana	1cea5cf	2021-06-21 11:10:38 +0100	[diff] [blame]	1599	list_splice_tail(&again_list, &fs_info->reclaim_bgs);
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	1600	spin_unlock(&fs_info->unused_bgs_lock);
				1601	mutex_unlock(&fs_info->reclaim_bgs_lock);
				1602	btrfs_exclop_finish(fs_info);
				1603	}
				1604
				1605	void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
				1606	{
				1607	spin_lock(&fs_info->unused_bgs_lock);
				1608	if (!list_empty(&fs_info->reclaim_bgs))
				1609	queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
				1610	spin_unlock(&fs_info->unused_bgs_lock);
				1611	}
				1612
				1613	void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
				1614	{
				1615	struct btrfs_fs_info *fs_info = bg->fs_info;
				1616
				1617	spin_lock(&fs_info->unused_bgs_lock);
				1618	if (list_empty(&bg->bg_list)) {
				1619	btrfs_get_block_group(bg);
				1620	trace_btrfs_add_reclaim_block_group(bg);
				1621	list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
				1622	}
				1623	spin_unlock(&fs_info->unused_bgs_lock);
				1624	}
				1625
Johannes Thumshirn	e3ba67a	2020-06-02 19:05:57 +0900	[diff] [blame]	1626	static int read_bg_from_eb(struct btrfs_fs_info fs_info, struct btrfs_key key,
				1627	struct btrfs_path *path)
				1628	{
				1629	struct extent_map_tree *em_tree;
				1630	struct extent_map *em;
				1631	struct btrfs_block_group_item bg;
				1632	struct extent_buffer *leaf;
				1633	int slot;
				1634	u64 flags;
				1635	int ret = 0;
				1636
				1637	slot = path->slots[0];
				1638	leaf = path->nodes[0];
				1639
				1640	em_tree = &fs_info->mapping_tree;
				1641	read_lock(&em_tree->lock);
				1642	em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
				1643	read_unlock(&em_tree->lock);
				1644	if (!em) {
				1645	btrfs_err(fs_info,
				1646	"logical %llu len %llu found bg but no related chunk",
				1647	key->objectid, key->offset);
				1648	return -ENOENT;
				1649	}
				1650
				1651	if (em->start != key->objectid \|\| em->len != key->offset) {
				1652	btrfs_err(fs_info,
				1653	"block group %llu len %llu mismatch with chunk %llu len %llu",
				1654	key->objectid, key->offset, em->start, em->len);
				1655	ret = -EUCLEAN;
				1656	goto out_free_em;
				1657	}
				1658
				1659	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
				1660	sizeof(bg));
				1661	flags = btrfs_stack_block_group_flags(&bg) &
				1662	BTRFS_BLOCK_GROUP_TYPE_MASK;
				1663
				1664	if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
				1665	btrfs_err(fs_info,
				1666	"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
				1667	key->objectid, key->offset, flags,
				1668	(BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
				1669	ret = -EUCLEAN;
				1670	}
				1671
				1672	out_free_em:
				1673	free_extent_map(em);
				1674	return ret;
				1675	}
				1676
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1677	static int find_first_block_group(struct btrfs_fs_info *fs_info,
				1678	struct btrfs_path *path,
				1679	struct btrfs_key *key)
				1680	{
				1681	struct btrfs_root *root = fs_info->extent_root;
Johannes Thumshirn	e3ba67a	2020-06-02 19:05:57 +0900	[diff] [blame]	1682	int ret;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1683	struct btrfs_key found_key;
				1684	struct extent_buffer *leaf;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1685	int slot;
				1686
				1687	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				1688	if (ret < 0)
Johannes Thumshirn	e3ba67a	2020-06-02 19:05:57 +0900	[diff] [blame]	1689	return ret;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1690
				1691	while (1) {
				1692	slot = path->slots[0];
				1693	leaf = path->nodes[0];
				1694	if (slot >= btrfs_header_nritems(leaf)) {
				1695	ret = btrfs_next_leaf(root, path);
				1696	if (ret == 0)
				1697	continue;
				1698	if (ret < 0)
				1699	goto out;
				1700	break;
				1701	}
				1702	btrfs_item_key_to_cpu(leaf, &found_key, slot);
				1703
				1704	if (found_key.objectid >= key->objectid &&
				1705	found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
Johannes Thumshirn	e3ba67a	2020-06-02 19:05:57 +0900	[diff] [blame]	1706	ret = read_bg_from_eb(fs_info, &found_key, path);
				1707	break;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1708	}
Johannes Thumshirn	e3ba67a	2020-06-02 19:05:57 +0900	[diff] [blame]	1709
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1710	path->slots[0]++;
				1711	}
				1712	out:
				1713	return ret;
				1714	}
				1715
				1716	static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
				1717	{
				1718	u64 extra_flags = chunk_to_extended(flags) &
				1719	BTRFS_EXTENDED_PROFILE_MASK;
				1720
				1721	write_seqlock(&fs_info->profiles_lock);
				1722	if (flags & BTRFS_BLOCK_GROUP_DATA)
				1723	fs_info->avail_data_alloc_bits \|= extra_flags;
				1724	if (flags & BTRFS_BLOCK_GROUP_METADATA)
				1725	fs_info->avail_metadata_alloc_bits \|= extra_flags;
				1726	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
				1727	fs_info->avail_system_alloc_bits \|= extra_flags;
				1728	write_sequnlock(&fs_info->profiles_lock);
				1729	}
				1730
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1731	/**
Nikolay Borisov	9ee9b97	2021-01-22 11:57:58 +0200	[diff] [blame]	1732	* Map a physical disk address to a list of logical addresses
				1733	*
				1734	* @fs_info: the filesystem
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1735	* @chunk_start: logical address of block group
Naohiro Aota	138082f	2021-02-04 19:22:02 +0900	[diff] [blame]	1736	* @bdev: physical device to resolve, can be NULL to indicate any device
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1737	* @physical: physical address to map to logical addresses
				1738	* @logical: return array of logical addresses which map to @physical
				1739	* @naddrs: length of @logical
				1740	* @stripe_len: size of IO stripe for the given block group
				1741	*
				1742	* Maps a particular @physical disk address to a list of @logical addresses.
				1743	* Used primarily to exclude those portions of a block group that contain super
				1744	* block copies.
				1745	*/
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1746	int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
Naohiro Aota	138082f	2021-02-04 19:22:02 +0900	[diff] [blame]	1747	struct block_device bdev, u64 physical, u64 *logical,
				1748	int naddrs, int stripe_len)
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1749	{
				1750	struct extent_map *em;
				1751	struct map_lookup *map;
				1752	u64 *buf;
				1753	u64 bytenr;
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1754	u64 data_stripe_length;
				1755	u64 io_stripe_size;
				1756	int i, nr = 0;
				1757	int ret = 0;
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1758
				1759	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
				1760	if (IS_ERR(em))
				1761	return -EIO;
				1762
				1763	map = em->map_lookup;
Nikolay Borisov	9e22b92	2020-04-03 16:40:34 +0300	[diff] [blame]	1764	data_stripe_length = em->orig_block_len;
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1765	io_stripe_size = map->stripe_len;
Naohiro Aota	138082f	2021-02-04 19:22:02 +0900	[diff] [blame]	1766	chunk_start = em->start;
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1767
Nikolay Borisov	9e22b92	2020-04-03 16:40:34 +0300	[diff] [blame]	1768	/* For RAID5/6 adjust to a full IO stripe length */
				1769	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1770	io_stripe_size = map->stripe_len * nr_data_stripes(map);
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1771
				1772	buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1773	if (!buf) {
				1774	ret = -ENOMEM;
				1775	goto out;
				1776	}
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1777
				1778	for (i = 0; i < map->num_stripes; i++) {
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1779	bool already_inserted = false;
				1780	u64 stripe_nr;
Naohiro Aota	138082f	2021-02-04 19:22:02 +0900	[diff] [blame]	1781	u64 offset;
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1782	int j;
				1783
				1784	if (!in_range(physical, map->stripes[i].physical,
				1785	data_stripe_length))
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1786	continue;
				1787
Naohiro Aota	138082f	2021-02-04 19:22:02 +0900	[diff] [blame]	1788	if (bdev && map->stripes[i].dev->bdev != bdev)
				1789	continue;
				1790
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1791	stripe_nr = physical - map->stripes[i].physical;
Naohiro Aota	138082f	2021-02-04 19:22:02 +0900	[diff] [blame]	1792	stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1793
				1794	if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
				1795	stripe_nr = stripe_nr * map->num_stripes + i;
				1796	stripe_nr = div_u64(stripe_nr, map->sub_stripes);
				1797	} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
				1798	stripe_nr = stripe_nr * map->num_stripes + i;
				1799	}
				1800	/*
				1801	* The remaining case would be for RAID56, multiply by
				1802	* nr_data_stripes(). Alternatively, just use rmap_len below
				1803	* instead of map->stripe_len
				1804	*/
				1805
Naohiro Aota	138082f	2021-02-04 19:22:02 +0900	[diff] [blame]	1806	bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1807
				1808	/* Ensure we don't add duplicate addresses */
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1809	for (j = 0; j < nr; j++) {
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1810	if (buf[j] == bytenr) {
				1811	already_inserted = true;
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1812	break;
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1813	}
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1814	}
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1815
				1816	if (!already_inserted)
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1817	buf[nr++] = bytenr;
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1818	}
				1819
				1820	*logical = buf;
				1821	*naddrs = nr;
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1822	*stripe_len = io_stripe_size;
				1823	out:
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1824	free_extent_map(em);
Nikolay Borisov	1776ad1	2019-11-19 14:05:53 +0200	[diff] [blame]	1825	return ret;
Nikolay Borisov	96a1433	2019-12-10 19:57:51 +0200	[diff] [blame]	1826	}
				1827
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1828	static int exclude_super_stripes(struct btrfs_block_group *cache)
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1829	{
				1830	struct btrfs_fs_info *fs_info = cache->fs_info;
Naohiro Aota	1265925	2020-11-10 20:26:14 +0900	[diff] [blame]	1831	const bool zoned = btrfs_is_zoned(fs_info);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1832	u64 bytenr;
				1833	u64 *logical;
				1834	int stripe_len;
				1835	int i, nr, ret;
				1836
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1837	if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
				1838	stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1839	cache->bytes_super += stripe_len;
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1840	ret = btrfs_add_excluded_extent(fs_info, cache->start,
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1841	stripe_len);
				1842	if (ret)
				1843	return ret;
				1844	}
				1845
				1846	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
				1847	bytenr = btrfs_sb_offset(i);
Naohiro Aota	138082f	2021-02-04 19:22:02 +0900	[diff] [blame]	1848	ret = btrfs_rmap_block(fs_info, cache->start, NULL,
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1849	bytenr, &logical, &nr, &stripe_len);
				1850	if (ret)
				1851	return ret;
				1852
Naohiro Aota	1265925	2020-11-10 20:26:14 +0900	[diff] [blame]	1853	/* Shouldn't have super stripes in sequential zones */
				1854	if (zoned && nr) {
				1855	btrfs_err(fs_info,
				1856	"zoned: block group %llu must not contain super block",
				1857	cache->start);
				1858	return -EUCLEAN;
				1859	}
				1860
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1861	while (nr--) {
Nikolay Borisov	96f9b0f	2020-04-03 16:40:35 +0300	[diff] [blame]	1862	u64 len = min_t(u64, stripe_len,
				1863	cache->start + cache->length - logical[nr]);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1864
				1865	cache->bytes_super += len;
Nikolay Borisov	96f9b0f	2020-04-03 16:40:35 +0300	[diff] [blame]	1866	ret = btrfs_add_excluded_extent(fs_info, logical[nr],
				1867	len);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1868	if (ret) {
				1869	kfree(logical);
				1870	return ret;
				1871	}
				1872	}
				1873
				1874	kfree(logical);
				1875	}
				1876	return 0;
				1877	}
				1878
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1879	static void link_block_group(struct btrfs_block_group *cache)
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1880	{
				1881	struct btrfs_space_info *space_info = cache->space_info;
				1882	int index = btrfs_bg_flags_to_raid_index(cache->flags);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1883
				1884	down_write(&space_info->groups_sem);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1885	list_add_tail(&cache->list, &space_info->block_groups[index]);
				1886	up_write(&space_info->groups_sem);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1887	}
				1888
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1889	static struct btrfs_block_group *btrfs_create_block_group_cache(
Qu Wenruo	9afc664	2020-05-05 07:58:20 +0800	[diff] [blame]	1890	struct btrfs_fs_info *fs_info, u64 start)
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1891	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1892	struct btrfs_block_group *cache;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1893
				1894	cache = kzalloc(sizeof(*cache), GFP_NOFS);
				1895	if (!cache)
				1896	return NULL;
				1897
				1898	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
				1899	GFP_NOFS);
				1900	if (!cache->free_space_ctl) {
				1901	kfree(cache);
				1902	return NULL;
				1903	}
				1904
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1905	cache->start = start;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1906
				1907	cache->fs_info = fs_info;
				1908	cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1909
Dennis Zhou	6e80d4f	2019-12-13 16:22:15 -0800	[diff] [blame]	1910	cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
				1911
Josef Bacik	48aaeeb	2020-07-06 09:14:11 -0400	[diff] [blame]	1912	refcount_set(&cache->refs, 1);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1913	spin_lock_init(&cache->lock);
				1914	init_rwsem(&cache->data_rwsem);
				1915	INIT_LIST_HEAD(&cache->list);
				1916	INIT_LIST_HEAD(&cache->cluster_list);
				1917	INIT_LIST_HEAD(&cache->bg_list);
				1918	INIT_LIST_HEAD(&cache->ro_list);
Dennis Zhou	b0643e5	2019-12-13 16:22:14 -0800	[diff] [blame]	1919	INIT_LIST_HEAD(&cache->discard_list);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1920	INIT_LIST_HEAD(&cache->dirty_list);
				1921	INIT_LIST_HEAD(&cache->io_list);
Naohiro Aota	afba2bc	2021-08-19 21:19:17 +0900	[diff] [blame]	1922	INIT_LIST_HEAD(&cache->active_bg_list);
Josef Bacik	cd79909	2020-10-23 09:58:08 -0400	[diff] [blame]	1923	btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
Filipe Manana	6b7304a	2020-05-08 11:01:47 +0100	[diff] [blame]	1924	atomic_set(&cache->frozen, 0);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1925	mutex_init(&cache->free_space_lock);
				1926	btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
				1927
				1928	return cache;
				1929	}
				1930
				1931	/*
				1932	* Iterate all chunks and verify that each of them has the corresponding block
				1933	* group
				1934	*/
				1935	static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
				1936	{
				1937	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
				1938	struct extent_map *em;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1939	struct btrfs_block_group *bg;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1940	u64 start = 0;
				1941	int ret = 0;
				1942
				1943	while (1) {
				1944	read_lock(&map_tree->lock);
				1945	/*
				1946	* lookup_extent_mapping will return the first extent map
				1947	* intersecting the range, so setting @len to 1 is enough to
				1948	* get the first chunk.
				1949	*/
				1950	em = lookup_extent_mapping(map_tree, start, 1);
				1951	read_unlock(&map_tree->lock);
				1952	if (!em)
				1953	break;
				1954
				1955	bg = btrfs_lookup_block_group(fs_info, em->start);
				1956	if (!bg) {
				1957	btrfs_err(fs_info,
				1958	"chunk start=%llu len=%llu doesn't have corresponding block group",
				1959	em->start, em->len);
				1960	ret = -EUCLEAN;
				1961	free_extent_map(em);
				1962	break;
				1963	}
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1964	if (bg->start != em->start \|\| bg->length != em->len \|\|
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1965	(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
				1966	(em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
				1967	btrfs_err(fs_info,
				1968	"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
				1969	em->start, em->len,
				1970	em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	1971	bg->start, bg->length,
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	1972	bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
				1973	ret = -EUCLEAN;
				1974	free_extent_map(em);
				1975	btrfs_put_block_group(bg);
				1976	break;
				1977	}
				1978	start = em->start + em->len;
				1979	free_extent_map(em);
				1980	btrfs_put_block_group(bg);
				1981	}
				1982	return ret;
				1983	}
				1984
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	1985	static int read_one_block_group(struct btrfs_fs_info *info,
Johannes Thumshirn	4afd2fe	2021-02-04 19:21:44 +0900	[diff] [blame]	1986	struct btrfs_block_group_item *bgi,
Qu Wenruo	d49a2dd	2019-11-05 09:35:35 +0800	[diff] [blame]	1987	const struct btrfs_key *key,
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	1988	int need_clear)
				1989	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	1990	struct btrfs_block_group *cache;
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	1991	struct btrfs_space_info *space_info;
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	1992	const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	1993	int ret;
				1994
Qu Wenruo	d49a2dd	2019-11-05 09:35:35 +0800	[diff] [blame]	1995	ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	1996
Qu Wenruo	9afc664	2020-05-05 07:58:20 +0800	[diff] [blame]	1997	cache = btrfs_create_block_group_cache(info, key->objectid);
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	1998	if (!cache)
				1999	return -ENOMEM;
				2000
Johannes Thumshirn	4afd2fe	2021-02-04 19:21:44 +0900	[diff] [blame]	2001	cache->length = key->offset;
				2002	cache->used = btrfs_stack_block_group_used(bgi);
				2003	cache->flags = btrfs_stack_block_group_flags(bgi);
Qu Wenruo	9afc664	2020-05-05 07:58:20 +0800	[diff] [blame]	2004
Marcos Paulo de Souza	e3e39c7	2020-08-21 11:54:44 -0300	[diff] [blame]	2005	set_free_space_tree_thresholds(cache);
				2006
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2007	if (need_clear) {
				2008	/*
				2009	* When we mount with old space cache, we need to
				2010	* set BTRFS_DC_CLEAR and set dirty flag.
				2011	*
				2012	* a) Setting 'BTRFS_DC_CLEAR' makes sure that we
				2013	* truncate the old free space cache inode and
				2014	* setup a new one.
				2015	* b) Setting 'dirty flag' makes sure that we flush
				2016	* the new space cache info onto disk.
				2017	*/
				2018	if (btrfs_test_opt(info, SPACE_CACHE))
				2019	cache->disk_cache_state = BTRFS_DC_CLEAR;
				2020	}
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2021	if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
				2022	(cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
				2023	btrfs_err(info,
				2024	"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
				2025	cache->start);
				2026	ret = -EINVAL;
				2027	goto error;
				2028	}
				2029
Naohiro Aota	a94794d	2021-02-04 19:21:51 +0900	[diff] [blame]	2030	ret = btrfs_load_block_group_zone_info(cache, false);
Naohiro Aota	08e11a3	2021-02-04 19:21:50 +0900	[diff] [blame]	2031	if (ret) {
				2032	btrfs_err(info, "zoned: failed to load zone info of bg %llu",
				2033	cache->start);
				2034	goto error;
				2035	}
				2036
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2037	/*
				2038	* We need to exclude the super stripes now so that the space info has
				2039	* super bytes accounted for, otherwise we'll think we have more space
				2040	* than we actually do.
				2041	*/
				2042	ret = exclude_super_stripes(cache);
				2043	if (ret) {
				2044	/* We may have excluded something, so call this just in case. */
				2045	btrfs_free_excluded_extents(cache);
				2046	goto error;
				2047	}
				2048
				2049	/*
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	2050	* For zoned filesystem, space after the allocation offset is the only
				2051	* free space for a block group. So, we don't need any caching work.
				2052	* btrfs_calc_zone_unusable() will set the amount of free space and
				2053	* zone_unusable space.
				2054	*
				2055	* For regular filesystem, check for two cases, either we are full, and
				2056	* therefore don't need to bother with the caching work since we won't
				2057	* find any space, or we are empty, and we can just add all the space
				2058	* in and be done with it. This saves us _a_lot_ of time, particularly
				2059	* in the full case.
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2060	*/
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	2061	if (btrfs_is_zoned(info)) {
				2062	btrfs_calc_zone_unusable(cache);
Naohiro Aota	c46c424	2021-08-19 21:19:09 +0900	[diff] [blame]	2063	/* Should not have any excluded extents. Just in case, though. */
				2064	btrfs_free_excluded_extents(cache);
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	2065	} else if (cache->length == cache->used) {
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2066	cache->last_byte_to_unpin = (u64)-1;
				2067	cache->cached = BTRFS_CACHE_FINISHED;
				2068	btrfs_free_excluded_extents(cache);
				2069	} else if (cache->used == 0) {
				2070	cache->last_byte_to_unpin = (u64)-1;
				2071	cache->cached = BTRFS_CACHE_FINISHED;
Qu Wenruo	9afc664	2020-05-05 07:58:20 +0800	[diff] [blame]	2072	add_new_free_space(cache, cache->start,
				2073	cache->start + cache->length);
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2074	btrfs_free_excluded_extents(cache);
				2075	}
				2076
				2077	ret = btrfs_add_block_group_cache(info, cache);
				2078	if (ret) {
				2079	btrfs_remove_free_space_cache(cache);
				2080	goto error;
				2081	}
				2082	trace_btrfs_add_block_group(info, cache, 0);
Qu Wenruo	9afc664	2020-05-05 07:58:20 +0800	[diff] [blame]	2083	btrfs_update_space_info(info, cache->flags, cache->length,
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	2084	cache->used, cache->bytes_super,
				2085	cache->zone_unusable, &space_info);
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2086
				2087	cache->space_info = space_info;
				2088
				2089	link_block_group(cache);
				2090
				2091	set_avail_alloc_bits(info, cache->flags);
Anand Jain	a09f23c	2021-08-24 13:27:42 +0800	[diff] [blame]	2092	if (btrfs_chunk_writeable(info, cache->start)) {
				2093	if (cache->used == 0) {
				2094	ASSERT(list_empty(&cache->bg_list));
				2095	if (btrfs_test_opt(info, DISCARD_ASYNC))
				2096	btrfs_discard_queue_work(&info->discard_ctl, cache);
				2097	else
				2098	btrfs_mark_bg_unused(cache);
				2099	}
				2100	} else {
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2101	inc_block_group_ro(cache, 1);
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2102	}
Anand Jain	a09f23c	2021-08-24 13:27:42 +0800	[diff] [blame]	2103
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2104	return 0;
				2105	error:
				2106	btrfs_put_block_group(cache);
				2107	return ret;
				2108	}
				2109
Josef Bacik	42437a6	2020-10-16 11:29:18 -0400	[diff] [blame]	2110	static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
				2111	{
				2112	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
				2113	struct btrfs_space_info *space_info;
				2114	struct rb_node *node;
				2115	int ret = 0;
				2116
				2117	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
				2118	struct extent_map *em;
				2119	struct map_lookup *map;
				2120	struct btrfs_block_group *bg;
				2121
				2122	em = rb_entry(node, struct extent_map, rb_node);
				2123	map = em->map_lookup;
				2124	bg = btrfs_create_block_group_cache(fs_info, em->start);
				2125	if (!bg) {
				2126	ret = -ENOMEM;
				2127	break;
				2128	}
				2129
				2130	/* Fill dummy cache as FULL */
				2131	bg->length = em->len;
				2132	bg->flags = map->type;
				2133	bg->last_byte_to_unpin = (u64)-1;
				2134	bg->cached = BTRFS_CACHE_FINISHED;
				2135	bg->used = em->len;
				2136	bg->flags = map->type;
				2137	ret = btrfs_add_block_group_cache(fs_info, bg);
Qu Wenruo	2b29726	2021-07-19 13:43:04 +0800	[diff] [blame]	2138	/*
				2139	* We may have some valid block group cache added already, in
				2140	* that case we skip to the next one.
				2141	*/
				2142	if (ret == -EEXIST) {
				2143	ret = 0;
				2144	btrfs_put_block_group(bg);
				2145	continue;
				2146	}
				2147
Josef Bacik	42437a6	2020-10-16 11:29:18 -0400	[diff] [blame]	2148	if (ret) {
				2149	btrfs_remove_free_space_cache(bg);
				2150	btrfs_put_block_group(bg);
				2151	break;
				2152	}
Qu Wenruo	2b29726	2021-07-19 13:43:04 +0800	[diff] [blame]	2153
Josef Bacik	42437a6	2020-10-16 11:29:18 -0400	[diff] [blame]	2154	btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	2155	0, 0, &space_info);
Josef Bacik	42437a6	2020-10-16 11:29:18 -0400	[diff] [blame]	2156	bg->space_info = space_info;
				2157	link_block_group(bg);
				2158
				2159	set_avail_alloc_bits(fs_info, bg->flags);
				2160	}
				2161	if (!ret)
				2162	btrfs_init_global_block_rsv(fs_info);
				2163	return ret;
				2164	}
				2165
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2166	int btrfs_read_block_groups(struct btrfs_fs_info *info)
				2167	{
				2168	struct btrfs_path *path;
				2169	int ret;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2170	struct btrfs_block_group *cache;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2171	struct btrfs_space_info *space_info;
				2172	struct btrfs_key key;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2173	int need_clear = 0;
				2174	u64 cache_gen;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2175
Josef Bacik	42437a6	2020-10-16 11:29:18 -0400	[diff] [blame]	2176	if (!info->extent_root)
				2177	return fill_dummy_bgs(info);
				2178
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2179	key.objectid = 0;
				2180	key.offset = 0;
				2181	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
				2182	path = btrfs_alloc_path();
				2183	if (!path)
				2184	return -ENOMEM;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2185
				2186	cache_gen = btrfs_super_cache_generation(info->super_copy);
				2187	if (btrfs_test_opt(info, SPACE_CACHE) &&
				2188	btrfs_super_generation(info->super_copy) != cache_gen)
				2189	need_clear = 1;
				2190	if (btrfs_test_opt(info, CLEAR_CACHE))
				2191	need_clear = 1;
				2192
				2193	while (1) {
Johannes Thumshirn	4afd2fe	2021-02-04 19:21:44 +0900	[diff] [blame]	2194	struct btrfs_block_group_item bgi;
				2195	struct extent_buffer *leaf;
				2196	int slot;
				2197
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2198	ret = find_first_block_group(info, path, &key);
				2199	if (ret > 0)
				2200	break;
				2201	if (ret != 0)
				2202	goto error;
				2203
Johannes Thumshirn	4afd2fe	2021-02-04 19:21:44 +0900	[diff] [blame]	2204	leaf = path->nodes[0];
				2205	slot = path->slots[0];
				2206
				2207	read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
				2208	sizeof(bgi));
				2209
				2210	btrfs_item_key_to_cpu(leaf, &key, slot);
				2211	btrfs_release_path(path);
				2212	ret = read_one_block_group(info, &bgi, &key, need_clear);
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2213	if (ret < 0)
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2214	goto error;
Qu Wenruo	ffb9e0f	2019-10-10 10:39:27 +0800	[diff] [blame]	2215	key.objectid += key.offset;
				2216	key.offset = 0;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2217	}
Josef Bacik	7837fa8	2020-10-14 17:00:51 -0400	[diff] [blame]	2218	btrfs_release_path(path);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2219
Josef Bacik	7280490	2020-09-01 17:40:37 -0400	[diff] [blame]	2220	list_for_each_entry(space_info, &info->space_info, list) {
Josef Bacik	49ea112	2020-09-01 17:40:38 -0400	[diff] [blame]	2221	int i;
				2222
				2223	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
				2224	if (list_empty(&space_info->block_groups[i]))
				2225	continue;
				2226	cache = list_first_entry(&space_info->block_groups[i],
				2227	struct btrfs_block_group,
				2228	list);
				2229	btrfs_sysfs_add_block_group_type(cache);
				2230	}
				2231
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2232	if (!(btrfs_get_alloc_profile(info, space_info->flags) &
				2233	(BTRFS_BLOCK_GROUP_RAID10 \|
				2234	BTRFS_BLOCK_GROUP_RAID1_MASK \|
				2235	BTRFS_BLOCK_GROUP_RAID56_MASK \|
				2236	BTRFS_BLOCK_GROUP_DUP)))
				2237	continue;
				2238	/*
				2239	* Avoid allocating from un-mirrored block group if there are
				2240	* mirrored block groups.
				2241	*/
				2242	list_for_each_entry(cache,
				2243	&space_info->block_groups[BTRFS_RAID_RAID0],
				2244	list)
Josef Bacik	e11c040	2019-06-20 15:38:07 -0400	[diff] [blame]	2245	inc_block_group_ro(cache, 1);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2246	list_for_each_entry(cache,
				2247	&space_info->block_groups[BTRFS_RAID_SINGLE],
				2248	list)
Josef Bacik	e11c040	2019-06-20 15:38:07 -0400	[diff] [blame]	2249	inc_block_group_ro(cache, 1);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2250	}
				2251
				2252	btrfs_init_global_block_rsv(info);
				2253	ret = check_chunk_block_group_mappings(info);
				2254	error:
				2255	btrfs_free_path(path);
Qu Wenruo	2b29726	2021-07-19 13:43:04 +0800	[diff] [blame]	2256	/*
				2257	* We've hit some error while reading the extent tree, and have
				2258	* rescue=ibadroots mount option.
				2259	* Try to fill the tree using dummy block groups so that the user can
				2260	* continue to mount and grab their data.
				2261	*/
				2262	if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
				2263	ret = fill_dummy_bgs(info);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2264	return ret;
				2265	}
				2266
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2267	/*
				2268	* This function, insert_block_group_item(), belongs to the phase 2 of chunk
				2269	* allocation.
				2270	*
				2271	* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
				2272	* phases.
				2273	*/
Qu Wenruo	97f4728	2020-05-05 07:58:22 +0800	[diff] [blame]	2274	static int insert_block_group_item(struct btrfs_trans_handle *trans,
				2275	struct btrfs_block_group *block_group)
				2276	{
				2277	struct btrfs_fs_info *fs_info = trans->fs_info;
				2278	struct btrfs_block_group_item bgi;
				2279	struct btrfs_root *root;
				2280	struct btrfs_key key;
				2281
				2282	spin_lock(&block_group->lock);
				2283	btrfs_set_stack_block_group_used(&bgi, block_group->used);
				2284	btrfs_set_stack_block_group_chunk_objectid(&bgi,
				2285	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
				2286	btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
				2287	key.objectid = block_group->start;
				2288	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
				2289	key.offset = block_group->length;
				2290	spin_unlock(&block_group->lock);
				2291
				2292	root = fs_info->extent_root;
				2293	return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
				2294	}
				2295
Nikolay Borisov	2eadb9e	2021-07-05 12:29:19 +0300	[diff] [blame]	2296	static int insert_dev_extent(struct btrfs_trans_handle *trans,
				2297	struct btrfs_device *device, u64 chunk_offset,
				2298	u64 start, u64 num_bytes)
				2299	{
				2300	struct btrfs_fs_info *fs_info = device->fs_info;
				2301	struct btrfs_root *root = fs_info->dev_root;
				2302	struct btrfs_path *path;
				2303	struct btrfs_dev_extent *extent;
				2304	struct extent_buffer *leaf;
				2305	struct btrfs_key key;
				2306	int ret;
				2307
				2308	WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
				2309	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
				2310	path = btrfs_alloc_path();
				2311	if (!path)
				2312	return -ENOMEM;
				2313
				2314	key.objectid = device->devid;
				2315	key.type = BTRFS_DEV_EXTENT_KEY;
				2316	key.offset = start;
				2317	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
				2318	if (ret)
				2319	goto out;
				2320
				2321	leaf = path->nodes[0];
				2322	extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
				2323	btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
				2324	btrfs_set_dev_extent_chunk_objectid(leaf, extent,
				2325	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
				2326	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
				2327
				2328	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
				2329	btrfs_mark_buffer_dirty(leaf);
				2330	out:
				2331	btrfs_free_path(path);
				2332	return ret;
				2333	}
				2334
				2335	/*
				2336	* This function belongs to phase 2.
				2337	*
				2338	* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
				2339	* phases.
				2340	*/
				2341	static int insert_dev_extents(struct btrfs_trans_handle *trans,
				2342	u64 chunk_offset, u64 chunk_size)
				2343	{
				2344	struct btrfs_fs_info *fs_info = trans->fs_info;
				2345	struct btrfs_device *device;
				2346	struct extent_map *em;
				2347	struct map_lookup *map;
				2348	u64 dev_offset;
				2349	u64 stripe_size;
				2350	int i;
				2351	int ret = 0;
				2352
				2353	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
				2354	if (IS_ERR(em))
				2355	return PTR_ERR(em);
				2356
				2357	map = em->map_lookup;
				2358	stripe_size = em->orig_block_len;
				2359
				2360	/*
				2361	* Take the device list mutex to prevent races with the final phase of
				2362	* a device replace operation that replaces the device object associated
				2363	* with the map's stripes, because the device object's id can change
				2364	* at any time during that final phase of the device replace operation
				2365	* (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
				2366	* replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
				2367	* resulting in persisting a device extent item with such ID.
				2368	*/
				2369	mutex_lock(&fs_info->fs_devices->device_list_mutex);
				2370	for (i = 0; i < map->num_stripes; i++) {
				2371	device = map->stripes[i].dev;
				2372	dev_offset = map->stripes[i].physical;
				2373
				2374	ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
				2375	stripe_size);
				2376	if (ret)
				2377	break;
				2378	}
				2379	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
				2380
				2381	free_extent_map(em);
				2382	return ret;
				2383	}
				2384
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2385	/*
				2386	* This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
				2387	* chunk allocation.
				2388	*
				2389	* See the comment at btrfs_chunk_alloc() for details about the chunk allocation
				2390	* phases.
				2391	*/
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2392	void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
				2393	{
				2394	struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2395	struct btrfs_block_group *block_group;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2396	int ret = 0;
				2397
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2398	while (!list_empty(&trans->new_bgs)) {
Josef Bacik	49ea112	2020-09-01 17:40:38 -0400	[diff] [blame]	2399	int index;
				2400
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2401	block_group = list_first_entry(&trans->new_bgs,
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2402	struct btrfs_block_group,
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2403	bg_list);
				2404	if (ret)
				2405	goto next;
				2406
Josef Bacik	49ea112	2020-09-01 17:40:38 -0400	[diff] [blame]	2407	index = btrfs_bg_flags_to_raid_index(block_group->flags);
				2408
Qu Wenruo	97f4728	2020-05-05 07:58:22 +0800	[diff] [blame]	2409	ret = insert_block_group_item(trans, block_group);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2410	if (ret)
				2411	btrfs_abort_transaction(trans, ret);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2412	if (!block_group->chunk_item_inserted) {
				2413	mutex_lock(&fs_info->chunk_mutex);
				2414	ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
				2415	mutex_unlock(&fs_info->chunk_mutex);
				2416	if (ret)
				2417	btrfs_abort_transaction(trans, ret);
				2418	}
Nikolay Borisov	2eadb9e	2021-07-05 12:29:19 +0300	[diff] [blame]	2419	ret = insert_dev_extents(trans, block_group->start,
				2420	block_group->length);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2421	if (ret)
				2422	btrfs_abort_transaction(trans, ret);
				2423	add_block_group_free_space(trans, block_group);
Josef Bacik	49ea112	2020-09-01 17:40:38 -0400	[diff] [blame]	2424
				2425	/*
				2426	* If we restriped during balance, we may have added a new raid
				2427	* type, so now add the sysfs entries when it is safe to do so.
				2428	* We don't have to worry about locking here as it's handled in
				2429	* btrfs_sysfs_add_block_group_type.
				2430	*/
				2431	if (block_group->space_info->block_group_kobjs[index] == NULL)
				2432	btrfs_sysfs_add_block_group_type(block_group);
				2433
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2434	/* Already aborted the transaction if it failed. */
				2435	next:
				2436	btrfs_delayed_refs_rsv_release(fs_info, 1);
				2437	list_del_init(&block_group->bg_list);
				2438	}
				2439	btrfs_trans_release_chunk_metadata(trans);
				2440	}
				2441
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2442	struct btrfs_block_group btrfs_make_block_group(struct btrfs_trans_handle trans,
				2443	u64 bytes_used, u64 type,
				2444	u64 chunk_offset, u64 size)
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2445	{
				2446	struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2447	struct btrfs_block_group *cache;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2448	int ret;
				2449
				2450	btrfs_set_log_full_commit(trans);
				2451
Qu Wenruo	9afc664	2020-05-05 07:58:20 +0800	[diff] [blame]	2452	cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2453	if (!cache)
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2454	return ERR_PTR(-ENOMEM);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2455
Qu Wenruo	9afc664	2020-05-05 07:58:20 +0800	[diff] [blame]	2456	cache->length = size;
Marcos Paulo de Souza	e3e39c7	2020-08-21 11:54:44 -0300	[diff] [blame]	2457	set_free_space_tree_thresholds(cache);
David Sterba	bf38be6	2019-10-23 18:48:11 +0200	[diff] [blame]	2458	cache->used = bytes_used;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2459	cache->flags = type;
				2460	cache->last_byte_to_unpin = (u64)-1;
				2461	cache->cached = BTRFS_CACHE_FINISHED;
Boris Burkov	997e3e2	2020-11-18 15:06:18 -0800	[diff] [blame]	2462	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
				2463	cache->needs_free_space = 1;
Naohiro Aota	08e11a3	2021-02-04 19:21:50 +0900	[diff] [blame]	2464
Naohiro Aota	a94794d	2021-02-04 19:21:51 +0900	[diff] [blame]	2465	ret = btrfs_load_block_group_zone_info(cache, true);
Naohiro Aota	08e11a3	2021-02-04 19:21:50 +0900	[diff] [blame]	2466	if (ret) {
				2467	btrfs_put_block_group(cache);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2468	return ERR_PTR(ret);
Naohiro Aota	08e11a3	2021-02-04 19:21:50 +0900	[diff] [blame]	2469	}
				2470
Naohiro Aota	eb66a01	2021-08-19 21:19:20 +0900	[diff] [blame]	2471	/*
				2472	* New block group is likely to be used soon. Try to activate it now.
				2473	* Failure is OK for now.
				2474	*/
				2475	btrfs_zone_activate(cache);
				2476
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2477	ret = exclude_super_stripes(cache);
				2478	if (ret) {
				2479	/* We may have excluded something, so call this just in case */
				2480	btrfs_free_excluded_extents(cache);
				2481	btrfs_put_block_group(cache);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2482	return ERR_PTR(ret);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2483	}
				2484
				2485	add_new_free_space(cache, chunk_offset, chunk_offset + size);
				2486
				2487	btrfs_free_excluded_extents(cache);
				2488
				2489	#ifdef CONFIG_BTRFS_DEBUG
				2490	if (btrfs_should_fragment_free_space(cache)) {
				2491	u64 new_bytes_used = size - bytes_used;
				2492
				2493	bytes_used += new_bytes_used >> 1;
Josef Bacik	e11c040	2019-06-20 15:38:07 -0400	[diff] [blame]	2494	fragment_free_space(cache);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2495	}
				2496	#endif
				2497	/*
				2498	* Ensure the corresponding space_info object is created and
				2499	* assigned to our block group. We want our bg to be added to the rbtree
				2500	* with its ->space_info set.
				2501	*/
				2502	cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
				2503	ASSERT(cache->space_info);
				2504
				2505	ret = btrfs_add_block_group_cache(fs_info, cache);
				2506	if (ret) {
				2507	btrfs_remove_free_space_cache(cache);
				2508	btrfs_put_block_group(cache);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2509	return ERR_PTR(ret);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2510	}
				2511
				2512	/*
				2513	* Now that our block group has its ->space_info set and is inserted in
				2514	* the rbtree, update the space info's counters.
				2515	*/
				2516	trace_btrfs_add_block_group(fs_info, cache, 1);
				2517	btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
Naohiro Aota	9817325	2021-08-19 21:19:10 +0900	[diff] [blame]	2518	cache->bytes_super, cache->zone_unusable,
				2519	&cache->space_info);
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2520	btrfs_update_global_block_rsv(fs_info);
				2521
				2522	link_block_group(cache);
				2523
				2524	list_add_tail(&cache->bg_list, &trans->new_bgs);
				2525	trans->delayed_ref_updates++;
				2526	btrfs_update_delayed_refs_rsv(trans);
				2527
				2528	set_avail_alloc_bits(fs_info, type);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	2529	return cache;
Josef Bacik	4358d963	2019-06-20 15:37:57 -0400	[diff] [blame]	2530	}
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2531
Qu Wenruo	b12de52	2019-11-15 10:09:00 +0800	[diff] [blame]	2532	/*
				2533	* Mark one block group RO, can be called several times for the same block
				2534	* group.
				2535	*
				2536	* @cache: the destination block group
				2537	* @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
				2538	* ensure we still have some free space after marking this
				2539	* block group RO.
				2540	*/
				2541	int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
				2542	bool do_chunk_alloc)
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2543	{
				2544	struct btrfs_fs_info *fs_info = cache->fs_info;
				2545	struct btrfs_trans_handle *trans;
				2546	u64 alloc_flags;
				2547	int ret;
Nikolay Borisov	b6e9f16	2021-02-17 15:12:50 +0200	[diff] [blame]	2548	bool dirty_bg_running;
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2549
Nikolay Borisov	b6e9f16	2021-02-17 15:12:50 +0200	[diff] [blame]	2550	do {
				2551	trans = btrfs_join_transaction(fs_info->extent_root);
				2552	if (IS_ERR(trans))
				2553	return PTR_ERR(trans);
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2554
Nikolay Borisov	b6e9f16	2021-02-17 15:12:50 +0200	[diff] [blame]	2555	dirty_bg_running = false;
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2556
Nikolay Borisov	b6e9f16	2021-02-17 15:12:50 +0200	[diff] [blame]	2557	/*
				2558	* We're not allowed to set block groups readonly after the dirty
				2559	* block group cache has started writing. If it already started,
				2560	* back off and let this transaction commit.
				2561	*/
				2562	mutex_lock(&fs_info->ro_block_group_mutex);
				2563	if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
				2564	u64 transid = trans->transid;
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2565
Nikolay Borisov	b6e9f16	2021-02-17 15:12:50 +0200	[diff] [blame]	2566	mutex_unlock(&fs_info->ro_block_group_mutex);
				2567	btrfs_end_transaction(trans);
				2568
				2569	ret = btrfs_wait_for_commit(fs_info, transid);
				2570	if (ret)
				2571	return ret;
				2572	dirty_bg_running = true;
				2573	}
				2574	} while (dirty_bg_running);
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2575
Qu Wenruo	b12de52	2019-11-15 10:09:00 +0800	[diff] [blame]	2576	if (do_chunk_alloc) {
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2577	/*
Qu Wenruo	b12de52	2019-11-15 10:09:00 +0800	[diff] [blame]	2578	* If we are changing raid levels, try to allocate a
				2579	* corresponding block group with the new raid level.
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2580	*/
Josef Bacik	349e120	2020-07-21 10:48:45 -0400	[diff] [blame]	2581	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
Qu Wenruo	b12de52	2019-11-15 10:09:00 +0800	[diff] [blame]	2582	if (alloc_flags != cache->flags) {
				2583	ret = btrfs_chunk_alloc(trans, alloc_flags,
				2584	CHUNK_ALLOC_FORCE);
				2585	/*
				2586	* ENOSPC is allowed here, we may have enough space
				2587	* already allocated at the new raid level to carry on
				2588	*/
				2589	if (ret == -ENOSPC)
				2590	ret = 0;
				2591	if (ret < 0)
				2592	goto out;
				2593	}
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2594	}
				2595
Josef Bacik	a7a63acc	2020-01-17 09:07:38 -0500	[diff] [blame]	2596	ret = inc_block_group_ro(cache, 0);
Filipe Manana	195a49e	2021-02-05 12:55:37 +0000	[diff] [blame]	2597	if (!do_chunk_alloc \|\| ret == -ETXTBSY)
Qu Wenruo	b12de52	2019-11-15 10:09:00 +0800	[diff] [blame]	2598	goto unlock_out;
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2599	if (!ret)
				2600	goto out;
				2601	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
				2602	ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
				2603	if (ret < 0)
				2604	goto out;
Josef Bacik	e11c040	2019-06-20 15:38:07 -0400	[diff] [blame]	2605	ret = inc_block_group_ro(cache, 0);
Filipe Manana	195a49e	2021-02-05 12:55:37 +0000	[diff] [blame]	2606	if (ret == -ETXTBSY)
				2607	goto unlock_out;
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2608	out:
				2609	if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
Josef Bacik	349e120	2020-07-21 10:48:45 -0400	[diff] [blame]	2610	alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2611	mutex_lock(&fs_info->chunk_mutex);
				2612	check_system_chunk(trans, alloc_flags);
				2613	mutex_unlock(&fs_info->chunk_mutex);
				2614	}
Qu Wenruo	b12de52	2019-11-15 10:09:00 +0800	[diff] [blame]	2615	unlock_out:
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2616	mutex_unlock(&fs_info->ro_block_group_mutex);
				2617
				2618	btrfs_end_transaction(trans);
				2619	return ret;
				2620	}
				2621
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2622	void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2623	{
				2624	struct btrfs_space_info *sinfo = cache->space_info;
				2625	u64 num_bytes;
				2626
				2627	BUG_ON(!cache->ro);
				2628
				2629	spin_lock(&sinfo->lock);
				2630	spin_lock(&cache->lock);
				2631	if (!--cache->ro) {
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	2632	if (btrfs_is_zoned(cache->fs_info)) {
				2633	/* Migrate zone_unusable bytes back */
Naohiro Aota	9817325	2021-08-19 21:19:10 +0900	[diff] [blame]	2634	cache->zone_unusable =
				2635	(cache->alloc_offset - cache->used) +
				2636	(cache->length - cache->zone_capacity);
Naohiro Aota	169e0da	2021-02-04 19:21:52 +0900	[diff] [blame]	2637	sinfo->bytes_zone_unusable += cache->zone_unusable;
				2638	sinfo->bytes_readonly -= cache->zone_unusable;
				2639	}
Naohiro Aota	f9f28e5	2021-06-17 13:56:18 +0900	[diff] [blame]	2640	num_bytes = cache->length - cache->reserved -
				2641	cache->pinned - cache->bytes_super -
				2642	cache->zone_unusable - cache->used;
				2643	sinfo->bytes_readonly -= num_bytes;
Josef Bacik	26ce209	2019-06-20 15:37:59 -0400	[diff] [blame]	2644	list_del_init(&cache->ro_list);
				2645	}
				2646	spin_unlock(&cache->lock);
				2647	spin_unlock(&sinfo->lock);
				2648	}
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2649
Qu Wenruo	3be4d8e	2020-05-05 07:58:23 +0800	[diff] [blame]	2650	static int update_block_group_item(struct btrfs_trans_handle *trans,
				2651	struct btrfs_path *path,
				2652	struct btrfs_block_group *cache)
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2653	{
				2654	struct btrfs_fs_info *fs_info = trans->fs_info;
				2655	int ret;
Qu Wenruo	3be4d8e	2020-05-05 07:58:23 +0800	[diff] [blame]	2656	struct btrfs_root *root = fs_info->extent_root;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2657	unsigned long bi;
				2658	struct extent_buffer *leaf;
David Sterba	bf38be6	2019-10-23 18:48:11 +0200	[diff] [blame]	2659	struct btrfs_block_group_item bgi;
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	2660	struct btrfs_key key;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2661
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	2662	key.objectid = cache->start;
				2663	key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
				2664	key.offset = cache->length;
				2665
Qu Wenruo	3be4d8e	2020-05-05 07:58:23 +0800	[diff] [blame]	2666	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2667	if (ret) {
				2668	if (ret > 0)
				2669	ret = -ENOENT;
				2670	goto fail;
				2671	}
				2672
				2673	leaf = path->nodes[0];
				2674	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
David Sterba	de0dc45	2019-10-23 18:48:18 +0200	[diff] [blame]	2675	btrfs_set_stack_block_group_used(&bgi, cache->used);
				2676	btrfs_set_stack_block_group_chunk_objectid(&bgi,
David Sterba	3d97638	2019-10-23 18:48:15 +0200	[diff] [blame]	2677	BTRFS_FIRST_CHUNK_TREE_OBJECTID);
David Sterba	de0dc45	2019-10-23 18:48:18 +0200	[diff] [blame]	2678	btrfs_set_stack_block_group_flags(&bgi, cache->flags);
David Sterba	bf38be6	2019-10-23 18:48:11 +0200	[diff] [blame]	2679	write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2680	btrfs_mark_buffer_dirty(leaf);
				2681	fail:
				2682	btrfs_release_path(path);
				2683	return ret;
				2684
				2685	}
				2686
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2687	static int cache_save_setup(struct btrfs_block_group *block_group,
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2688	struct btrfs_trans_handle *trans,
				2689	struct btrfs_path *path)
				2690	{
				2691	struct btrfs_fs_info *fs_info = block_group->fs_info;
				2692	struct btrfs_root *root = fs_info->tree_root;
				2693	struct inode *inode = NULL;
				2694	struct extent_changeset *data_reserved = NULL;
				2695	u64 alloc_hint = 0;
				2696	int dcs = BTRFS_DC_ERROR;
Qu Wenruo	0044ae1	2021-04-13 14:23:14 +0800	[diff] [blame]	2697	u64 cache_size = 0;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2698	int retries = 0;
				2699	int ret = 0;
				2700
Boris Burkov	af456a2	2020-11-18 15:06:26 -0800	[diff] [blame]	2701	if (!btrfs_test_opt(fs_info, SPACE_CACHE))
				2702	return 0;
				2703
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2704	/*
				2705	* If this block group is smaller than 100 megs don't bother caching the
				2706	* block group.
				2707	*/
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	2708	if (block_group->length < (100 * SZ_1M)) {
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2709	spin_lock(&block_group->lock);
				2710	block_group->disk_cache_state = BTRFS_DC_WRITTEN;
				2711	spin_unlock(&block_group->lock);
				2712	return 0;
				2713	}
				2714
David Sterba	bf31f87	2020-02-05 17:34:34 +0100	[diff] [blame]	2715	if (TRANS_ABORTED(trans))
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2716	return 0;
				2717	again:
				2718	inode = lookup_free_space_inode(block_group, path);
				2719	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
				2720	ret = PTR_ERR(inode);
				2721	btrfs_release_path(path);
				2722	goto out;
				2723	}
				2724
				2725	if (IS_ERR(inode)) {
				2726	BUG_ON(retries);
				2727	retries++;
				2728
				2729	if (block_group->ro)
				2730	goto out_free;
				2731
				2732	ret = create_free_space_inode(trans, block_group, path);
				2733	if (ret)
				2734	goto out_free;
				2735	goto again;
				2736	}
				2737
				2738	/*
				2739	* We want to set the generation to 0, that way if anything goes wrong
				2740	* from here on out we know not to trust this cache when we load up next
				2741	* time.
				2742	*/
				2743	BTRFS_I(inode)->generation = 0;
Nikolay Borisov	9a56fcd	2020-11-02 16:48:59 +0200	[diff] [blame]	2744	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2745	if (ret) {
				2746	/*
				2747	* So theoretically we could recover from this, simply set the
				2748	* super cache generation to 0 so we know to invalidate the
				2749	* cache, but then we'd have to keep track of the block groups
				2750	* that fail this way so we know we _have_ to reset this cache
				2751	* before the next commit or risk reading stale cache. So to
				2752	* limit our exposure to horrible edge cases lets just abort the
				2753	* transaction, this only happens in really bad situations
				2754	* anyway.
				2755	*/
				2756	btrfs_abort_transaction(trans, ret);
				2757	goto out_put;
				2758	}
				2759	WARN_ON(ret);
				2760
				2761	/* We've already setup this transaction, go ahead and exit */
				2762	if (block_group->cache_generation == trans->transid &&
				2763	i_size_read(inode)) {
				2764	dcs = BTRFS_DC_SETUP;
				2765	goto out_put;
				2766	}
				2767
				2768	if (i_size_read(inode) > 0) {
				2769	ret = btrfs_check_trunc_cache_free_space(fs_info,
				2770	&fs_info->global_block_rsv);
				2771	if (ret)
				2772	goto out_put;
				2773
				2774	ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
				2775	if (ret)
				2776	goto out_put;
				2777	}
				2778
				2779	spin_lock(&block_group->lock);
				2780	if (block_group->cached != BTRFS_CACHE_FINISHED \|\|
				2781	!btrfs_test_opt(fs_info, SPACE_CACHE)) {
				2782	/*
				2783	* don't bother trying to write stuff out _if_
				2784	* a) we're not cached,
				2785	* b) we're with nospace_cache mount option,
				2786	* c) we're with v2 space_cache (FREE_SPACE_TREE).
				2787	*/
				2788	dcs = BTRFS_DC_WRITTEN;
				2789	spin_unlock(&block_group->lock);
				2790	goto out_put;
				2791	}
				2792	spin_unlock(&block_group->lock);
				2793
				2794	/*
				2795	* We hit an ENOSPC when setting up the cache in this transaction, just
				2796	* skip doing the setup, we've already cleared the cache so we're safe.
				2797	*/
				2798	if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
				2799	ret = -ENOSPC;
				2800	goto out_put;
				2801	}
				2802
				2803	/*
				2804	* Try to preallocate enough space based on how big the block group is.
				2805	* Keep in mind this has to include any pinned space which could end up
				2806	* taking up quite a bit since it's not folded into the other space
				2807	* cache.
				2808	*/
Qu Wenruo	0044ae1	2021-04-13 14:23:14 +0800	[diff] [blame]	2809	cache_size = div_u64(block_group->length, SZ_256M);
				2810	if (!cache_size)
				2811	cache_size = 1;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2812
Qu Wenruo	0044ae1	2021-04-13 14:23:14 +0800	[diff] [blame]	2813	cache_size *= 16;
				2814	cache_size *= fs_info->sectorsize;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2815
Nikolay Borisov	36ea6f3	2020-06-03 08:55:41 +0300	[diff] [blame]	2816	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
Qu Wenruo	0044ae1	2021-04-13 14:23:14 +0800	[diff] [blame]	2817	cache_size);
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2818	if (ret)
				2819	goto out_put;
				2820
Qu Wenruo	0044ae1	2021-04-13 14:23:14 +0800	[diff] [blame]	2821	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
				2822	cache_size, cache_size,
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2823	&alloc_hint);
				2824	/*
				2825	* Our cache requires contiguous chunks so that we don't modify a bunch
				2826	* of metadata or split extents when writing the cache out, which means
				2827	* we can enospc if we are heavily fragmented in addition to just normal
				2828	* out of space conditions. So if we hit this just skip setting up any
				2829	* other block groups for this transaction, maybe we'll unpin enough
				2830	* space the next time around.
				2831	*/
				2832	if (!ret)
				2833	dcs = BTRFS_DC_SETUP;
				2834	else if (ret == -ENOSPC)
				2835	set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
				2836
				2837	out_put:
				2838	iput(inode);
				2839	out_free:
				2840	btrfs_release_path(path);
				2841	out:
				2842	spin_lock(&block_group->lock);
				2843	if (!ret && dcs == BTRFS_DC_SETUP)
				2844	block_group->cache_generation = trans->transid;
				2845	block_group->disk_cache_state = dcs;
				2846	spin_unlock(&block_group->lock);
				2847
				2848	extent_changeset_free(data_reserved);
				2849	return ret;
				2850	}
				2851
				2852	int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
				2853	{
				2854	struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2855	struct btrfs_block_group cache, tmp;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2856	struct btrfs_transaction *cur_trans = trans->transaction;
				2857	struct btrfs_path *path;
				2858
				2859	if (list_empty(&cur_trans->dirty_bgs) \|\|
				2860	!btrfs_test_opt(fs_info, SPACE_CACHE))
				2861	return 0;
				2862
				2863	path = btrfs_alloc_path();
				2864	if (!path)
				2865	return -ENOMEM;
				2866
				2867	/* Could add new block groups, use _safe just in case */
				2868	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
				2869	dirty_list) {
				2870	if (cache->disk_cache_state == BTRFS_DC_CLEAR)
				2871	cache_save_setup(cache, trans, path);
				2872	}
				2873
				2874	btrfs_free_path(path);
				2875	return 0;
				2876	}
				2877
				2878	/*
				2879	* Transaction commit does final block group cache writeback during a critical
				2880	* section where nothing is allowed to change the FS. This is required in
				2881	* order for the cache to actually match the block group, but can introduce a
				2882	* lot of latency into the commit.
				2883	*
				2884	* So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
				2885	* There's a chance we'll have to redo some of it if the block group changes
				2886	* again during the commit, but it greatly reduces the commit latency by
				2887	* getting rid of the easy block groups while we're still allowing others to
				2888	* join the commit.
				2889	*/
				2890	int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
				2891	{
				2892	struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2893	struct btrfs_block_group *cache;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2894	struct btrfs_transaction *cur_trans = trans->transaction;
				2895	int ret = 0;
				2896	int should_put;
				2897	struct btrfs_path *path = NULL;
				2898	LIST_HEAD(dirty);
				2899	struct list_head *io = &cur_trans->io_bgs;
				2900	int num_started = 0;
				2901	int loops = 0;
				2902
				2903	spin_lock(&cur_trans->dirty_bgs_lock);
				2904	if (list_empty(&cur_trans->dirty_bgs)) {
				2905	spin_unlock(&cur_trans->dirty_bgs_lock);
				2906	return 0;
				2907	}
				2908	list_splice_init(&cur_trans->dirty_bgs, &dirty);
				2909	spin_unlock(&cur_trans->dirty_bgs_lock);
				2910
				2911	again:
				2912	/* Make sure all the block groups on our dirty list actually exist */
				2913	btrfs_create_pending_block_groups(trans);
				2914
				2915	if (!path) {
				2916	path = btrfs_alloc_path();
Josef Bacik	938fcbf	2021-01-14 14:02:43 -0500	[diff] [blame]	2917	if (!path) {
				2918	ret = -ENOMEM;
				2919	goto out;
				2920	}
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2921	}
				2922
				2923	/*
				2924	* cache_write_mutex is here only to save us from balance or automatic
				2925	* removal of empty block groups deleting this block group while we are
				2926	* writing out the cache
				2927	*/
				2928	mutex_lock(&trans->transaction->cache_write_mutex);
				2929	while (!list_empty(&dirty)) {
				2930	bool drop_reserve = true;
				2931
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	2932	cache = list_first_entry(&dirty, struct btrfs_block_group,
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2933	dirty_list);
				2934	/*
				2935	* This can happen if something re-dirties a block group that
				2936	* is already under IO. Just wait for it to finish and then do
				2937	* it all again
				2938	*/
				2939	if (!list_empty(&cache->io_list)) {
				2940	list_del_init(&cache->io_list);
				2941	btrfs_wait_cache_io(trans, cache, path);
				2942	btrfs_put_block_group(cache);
				2943	}
				2944
				2945
				2946	/*
				2947	* btrfs_wait_cache_io uses the cache->dirty_list to decide if
				2948	* it should update the cache_state. Don't delete until after
				2949	* we wait.
				2950	*
				2951	* Since we're not running in the commit critical section
				2952	* we need the dirty_bgs_lock to protect from update_block_group
				2953	*/
				2954	spin_lock(&cur_trans->dirty_bgs_lock);
				2955	list_del_init(&cache->dirty_list);
				2956	spin_unlock(&cur_trans->dirty_bgs_lock);
				2957
				2958	should_put = 1;
				2959
				2960	cache_save_setup(cache, trans, path);
				2961
				2962	if (cache->disk_cache_state == BTRFS_DC_SETUP) {
				2963	cache->io_ctl.inode = NULL;
				2964	ret = btrfs_write_out_cache(trans, cache, path);
				2965	if (ret == 0 && cache->io_ctl.inode) {
				2966	num_started++;
				2967	should_put = 0;
				2968
				2969	/*
				2970	* The cache_write_mutex is protecting the
				2971	* io_list, also refer to the definition of
				2972	* btrfs_transaction::io_bgs for more details
				2973	*/
				2974	list_add_tail(&cache->io_list, io);
				2975	} else {
				2976	/*
				2977	* If we failed to write the cache, the
				2978	* generation will be bad and life goes on
				2979	*/
				2980	ret = 0;
				2981	}
				2982	}
				2983	if (!ret) {
Qu Wenruo	3be4d8e	2020-05-05 07:58:23 +0800	[diff] [blame]	2984	ret = update_block_group_item(trans, path, cache);
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	2985	/*
				2986	* Our block group might still be attached to the list
				2987	* of new block groups in the transaction handle of some
				2988	* other task (struct btrfs_trans_handle->new_bgs). This
				2989	* means its block group item isn't yet in the extent
				2990	* tree. If this happens ignore the error, as we will
				2991	* try again later in the critical section of the
				2992	* transaction commit.
				2993	*/
				2994	if (ret == -ENOENT) {
				2995	ret = 0;
				2996	spin_lock(&cur_trans->dirty_bgs_lock);
				2997	if (list_empty(&cache->dirty_list)) {
				2998	list_add_tail(&cache->dirty_list,
				2999	&cur_trans->dirty_bgs);
				3000	btrfs_get_block_group(cache);
				3001	drop_reserve = false;
				3002	}
				3003	spin_unlock(&cur_trans->dirty_bgs_lock);
				3004	} else if (ret) {
				3005	btrfs_abort_transaction(trans, ret);
				3006	}
				3007	}
				3008
				3009	/* If it's not on the io list, we need to put the block group */
				3010	if (should_put)
				3011	btrfs_put_block_group(cache);
				3012	if (drop_reserve)
				3013	btrfs_delayed_refs_rsv_release(fs_info, 1);
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3014	/*
				3015	* Avoid blocking other tasks for too long. It might even save
				3016	* us from writing caches for block groups that are going to be
				3017	* removed.
				3018	*/
				3019	mutex_unlock(&trans->transaction->cache_write_mutex);
Josef Bacik	938fcbf	2021-01-14 14:02:43 -0500	[diff] [blame]	3020	if (ret)
				3021	goto out;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3022	mutex_lock(&trans->transaction->cache_write_mutex);
				3023	}
				3024	mutex_unlock(&trans->transaction->cache_write_mutex);
				3025
				3026	/*
				3027	* Go through delayed refs for all the stuff we've just kicked off
				3028	* and then loop back (just once)
				3029	*/
Josef Bacik	34d1eb0	2020-12-16 11:22:17 -0500	[diff] [blame]	3030	if (!ret)
				3031	ret = btrfs_run_delayed_refs(trans, 0);
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3032	if (!ret && loops == 0) {
				3033	loops++;
				3034	spin_lock(&cur_trans->dirty_bgs_lock);
				3035	list_splice_init(&cur_trans->dirty_bgs, &dirty);
				3036	/*
				3037	* dirty_bgs_lock protects us from concurrent block group
				3038	* deletes too (not just cache_write_mutex).
				3039	*/
				3040	if (!list_empty(&dirty)) {
				3041	spin_unlock(&cur_trans->dirty_bgs_lock);
				3042	goto again;
				3043	}
				3044	spin_unlock(&cur_trans->dirty_bgs_lock);
Josef Bacik	938fcbf	2021-01-14 14:02:43 -0500	[diff] [blame]	3045	}
				3046	out:
				3047	if (ret < 0) {
				3048	spin_lock(&cur_trans->dirty_bgs_lock);
				3049	list_splice_init(&dirty, &cur_trans->dirty_bgs);
				3050	spin_unlock(&cur_trans->dirty_bgs_lock);
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3051	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
				3052	}
				3053
				3054	btrfs_free_path(path);
				3055	return ret;
				3056	}
				3057
				3058	int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
				3059	{
				3060	struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3061	struct btrfs_block_group *cache;
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3062	struct btrfs_transaction *cur_trans = trans->transaction;
				3063	int ret = 0;
				3064	int should_put;
				3065	struct btrfs_path *path;
				3066	struct list_head *io = &cur_trans->io_bgs;
				3067	int num_started = 0;
				3068
				3069	path = btrfs_alloc_path();
				3070	if (!path)
				3071	return -ENOMEM;
				3072
				3073	/*
				3074	* Even though we are in the critical section of the transaction commit,
				3075	* we can still have concurrent tasks adding elements to this
				3076	* transaction's list of dirty block groups. These tasks correspond to
				3077	* endio free space workers started when writeback finishes for a
				3078	* space cache, which run inode.c:btrfs_finish_ordered_io(), and can
				3079	* allocate new block groups as a result of COWing nodes of the root
				3080	* tree when updating the free space inode. The writeback for the space
				3081	* caches is triggered by an earlier call to
				3082	* btrfs_start_dirty_block_groups() and iterations of the following
				3083	* loop.
				3084	* Also we want to do the cache_save_setup first and then run the
				3085	* delayed refs to make sure we have the best chance at doing this all
				3086	* in one shot.
				3087	*/
				3088	spin_lock(&cur_trans->dirty_bgs_lock);
				3089	while (!list_empty(&cur_trans->dirty_bgs)) {
				3090	cache = list_first_entry(&cur_trans->dirty_bgs,
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3091	struct btrfs_block_group,
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3092	dirty_list);
				3093
				3094	/*
				3095	* This can happen if cache_save_setup re-dirties a block group
				3096	* that is already under IO. Just wait for it to finish and
				3097	* then do it all again
				3098	*/
				3099	if (!list_empty(&cache->io_list)) {
				3100	spin_unlock(&cur_trans->dirty_bgs_lock);
				3101	list_del_init(&cache->io_list);
				3102	btrfs_wait_cache_io(trans, cache, path);
				3103	btrfs_put_block_group(cache);
				3104	spin_lock(&cur_trans->dirty_bgs_lock);
				3105	}
				3106
				3107	/*
				3108	* Don't remove from the dirty list until after we've waited on
				3109	* any pending IO
				3110	*/
				3111	list_del_init(&cache->dirty_list);
				3112	spin_unlock(&cur_trans->dirty_bgs_lock);
				3113	should_put = 1;
				3114
				3115	cache_save_setup(cache, trans, path);
				3116
				3117	if (!ret)
				3118	ret = btrfs_run_delayed_refs(trans,
				3119	(unsigned long) -1);
				3120
				3121	if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
				3122	cache->io_ctl.inode = NULL;
				3123	ret = btrfs_write_out_cache(trans, cache, path);
				3124	if (ret == 0 && cache->io_ctl.inode) {
				3125	num_started++;
				3126	should_put = 0;
				3127	list_add_tail(&cache->io_list, io);
				3128	} else {
				3129	/*
				3130	* If we failed to write the cache, the
				3131	* generation will be bad and life goes on
				3132	*/
				3133	ret = 0;
				3134	}
				3135	}
				3136	if (!ret) {
Qu Wenruo	3be4d8e	2020-05-05 07:58:23 +0800	[diff] [blame]	3137	ret = update_block_group_item(trans, path, cache);
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3138	/*
				3139	* One of the free space endio workers might have
				3140	* created a new block group while updating a free space
				3141	* cache's inode (at inode.c:btrfs_finish_ordered_io())
				3142	* and hasn't released its transaction handle yet, in
				3143	* which case the new block group is still attached to
				3144	* its transaction handle and its creation has not
				3145	* finished yet (no block group item in the extent tree
				3146	* yet, etc). If this is the case, wait for all free
				3147	* space endio workers to finish and retry. This is a
Randy Dunlap	260db43	2020-08-04 19:48:34 -0700	[diff] [blame]	3148	* very rare case so no need for a more efficient and
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3149	* complex approach.
				3150	*/
				3151	if (ret == -ENOENT) {
				3152	wait_event(cur_trans->writer_wait,
				3153	atomic_read(&cur_trans->num_writers) == 1);
Qu Wenruo	3be4d8e	2020-05-05 07:58:23 +0800	[diff] [blame]	3154	ret = update_block_group_item(trans, path, cache);
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3155	}
				3156	if (ret)
				3157	btrfs_abort_transaction(trans, ret);
				3158	}
				3159
				3160	/* If its not on the io list, we need to put the block group */
				3161	if (should_put)
				3162	btrfs_put_block_group(cache);
				3163	btrfs_delayed_refs_rsv_release(fs_info, 1);
				3164	spin_lock(&cur_trans->dirty_bgs_lock);
				3165	}
				3166	spin_unlock(&cur_trans->dirty_bgs_lock);
				3167
				3168	/*
				3169	* Refer to the definition of io_bgs member for details why it's safe
				3170	* to use it without any locking
				3171	*/
				3172	while (!list_empty(io)) {
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3173	cache = list_first_entry(io, struct btrfs_block_group,
Josef Bacik	77745c0	2019-06-20 15:38:00 -0400	[diff] [blame]	3174	io_list);
				3175	list_del_init(&cache->io_list);
				3176	btrfs_wait_cache_io(trans, cache, path);
				3177	btrfs_put_block_group(cache);
				3178	}
				3179
				3180	btrfs_free_path(path);
				3181	return ret;
				3182	}
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3183
				3184	int btrfs_update_block_group(struct btrfs_trans_handle *trans,
Anand Jain	11b66fa	2021-10-13 14:05:14 +0800	[diff] [blame]	3185	u64 bytenr, u64 num_bytes, bool alloc)
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3186	{
				3187	struct btrfs_fs_info *info = trans->fs_info;
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3188	struct btrfs_block_group *cache = NULL;
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3189	u64 total = num_bytes;
				3190	u64 old_val;
				3191	u64 byte_in_group;
				3192	int factor;
				3193	int ret = 0;
				3194
				3195	/* Block accounting for super block */
				3196	spin_lock(&info->delalloc_root_lock);
				3197	old_val = btrfs_super_bytes_used(info->super_copy);
				3198	if (alloc)
				3199	old_val += num_bytes;
				3200	else
				3201	old_val -= num_bytes;
				3202	btrfs_set_super_bytes_used(info->super_copy, old_val);
				3203	spin_unlock(&info->delalloc_root_lock);
				3204
				3205	while (total) {
				3206	cache = btrfs_lookup_block_group(info, bytenr);
				3207	if (!cache) {
				3208	ret = -ENOENT;
				3209	break;
				3210	}
				3211	factor = btrfs_bg_type_to_factor(cache->flags);
				3212
				3213	/*
				3214	* If this block group has free space cache written out, we
				3215	* need to make sure to load it if we are removing space. This
				3216	* is because we need the unpinning stage to actually add the
				3217	* space back to the block group, otherwise we will leak space.
				3218	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3219	if (!alloc && !btrfs_block_group_done(cache))
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3220	btrfs_cache_block_group(cache, 1);
				3221
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	3222	byte_in_group = bytenr - cache->start;
				3223	WARN_ON(byte_in_group > cache->length);
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3224
				3225	spin_lock(&cache->space_info->lock);
				3226	spin_lock(&cache->lock);
				3227
				3228	if (btrfs_test_opt(info, SPACE_CACHE) &&
				3229	cache->disk_cache_state < BTRFS_DC_CLEAR)
				3230	cache->disk_cache_state = BTRFS_DC_CLEAR;
				3231
David Sterba	bf38be6	2019-10-23 18:48:11 +0200	[diff] [blame]	3232	old_val = cache->used;
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	3233	num_bytes = min(total, cache->length - byte_in_group);
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3234	if (alloc) {
				3235	old_val += num_bytes;
David Sterba	bf38be6	2019-10-23 18:48:11 +0200	[diff] [blame]	3236	cache->used = old_val;
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3237	cache->reserved -= num_bytes;
				3238	cache->space_info->bytes_reserved -= num_bytes;
				3239	cache->space_info->bytes_used += num_bytes;
				3240	cache->space_info->disk_used += num_bytes * factor;
				3241	spin_unlock(&cache->lock);
				3242	spin_unlock(&cache->space_info->lock);
				3243	} else {
				3244	old_val -= num_bytes;
David Sterba	bf38be6	2019-10-23 18:48:11 +0200	[diff] [blame]	3245	cache->used = old_val;
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3246	cache->pinned += num_bytes;
				3247	btrfs_space_info_update_bytes_pinned(info,
				3248	cache->space_info, num_bytes);
				3249	cache->space_info->bytes_used -= num_bytes;
				3250	cache->space_info->disk_used -= num_bytes * factor;
				3251	spin_unlock(&cache->lock);
				3252	spin_unlock(&cache->space_info->lock);
				3253
Nikolay Borisov	fe119a6	2020-01-20 16:09:18 +0200	[diff] [blame]	3254	set_extent_dirty(&trans->transaction->pinned_extents,
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3255	bytenr, bytenr + num_bytes - 1,
				3256	GFP_NOFS \| __GFP_NOFAIL);
				3257	}
				3258
				3259	spin_lock(&trans->transaction->dirty_bgs_lock);
				3260	if (list_empty(&cache->dirty_list)) {
				3261	list_add_tail(&cache->dirty_list,
				3262	&trans->transaction->dirty_bgs);
				3263	trans->delayed_ref_updates++;
				3264	btrfs_get_block_group(cache);
				3265	}
				3266	spin_unlock(&trans->transaction->dirty_bgs_lock);
				3267
				3268	/*
				3269	* No longer have used bytes in this block group, queue it for
				3270	* deletion. We do this after adding the block group to the
				3271	* dirty list to avoid races between cleaner kthread and space
				3272	* cache writeout.
				3273	*/
Dennis Zhou	6e80d4f	2019-12-13 16:22:15 -0800	[diff] [blame]	3274	if (!alloc && old_val == 0) {
				3275	if (!btrfs_test_opt(info, DISCARD_ASYNC))
				3276	btrfs_mark_bg_unused(cache);
				3277	}
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3278
				3279	btrfs_put_block_group(cache);
				3280	total -= num_bytes;
				3281	bytenr += num_bytes;
				3282	}
				3283
				3284	/* Modified block groups are accounted for in the delayed_refs_rsv. */
				3285	btrfs_update_delayed_refs_rsv(trans);
				3286	return ret;
				3287	}
				3288
				3289	/**
				3290	* btrfs_add_reserved_bytes - update the block_group and space info counters
				3291	* @cache: The cache we are manipulating
				3292	* @ram_bytes: The number of bytes of file content, and will be same to
				3293	* @num_bytes except for the compress path.
				3294	* @num_bytes: The number of bytes in question
				3295	* @delalloc: The blocks are allocated for the delalloc write
				3296	*
				3297	* This is called by the allocator when it reserves space. If this is a
				3298	* reservation and the block group has become read only we cannot make the
				3299	* reservation and return -EAGAIN, otherwise this function always succeeds.
				3300	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3301	int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3302	u64 ram_bytes, u64 num_bytes, int delalloc)
				3303	{
				3304	struct btrfs_space_info *space_info = cache->space_info;
				3305	int ret = 0;
				3306
				3307	spin_lock(&space_info->lock);
				3308	spin_lock(&cache->lock);
				3309	if (cache->ro) {
				3310	ret = -EAGAIN;
				3311	} else {
				3312	cache->reserved += num_bytes;
				3313	space_info->bytes_reserved += num_bytes;
Josef Bacik	a43c383	2019-08-22 15:10:56 -0400	[diff] [blame]	3314	trace_btrfs_space_reservation(cache->fs_info, "space_info",
				3315	space_info->flags, num_bytes, 1);
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3316	btrfs_space_info_update_bytes_may_use(cache->fs_info,
				3317	space_info, -ram_bytes);
				3318	if (delalloc)
				3319	cache->delalloc_bytes += num_bytes;
Josef Bacik	99ffb43	2020-07-21 10:22:19 -0400	[diff] [blame]	3320
				3321	/*
				3322	* Compression can use less space than we reserved, so wake
				3323	* tickets if that happens
				3324	*/
				3325	if (num_bytes < ram_bytes)
				3326	btrfs_try_granting_tickets(cache->fs_info, space_info);
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3327	}
				3328	spin_unlock(&cache->lock);
				3329	spin_unlock(&space_info->lock);
				3330	return ret;
				3331	}
				3332
				3333	/**
				3334	* btrfs_free_reserved_bytes - update the block_group and space info counters
				3335	* @cache: The cache we are manipulating
				3336	* @num_bytes: The number of bytes in question
				3337	* @delalloc: The blocks are allocated for the delalloc write
				3338	*
				3339	* This is called by somebody who is freeing space that was never actually used
				3340	* on disk. For example if you reserve some space for a new leaf in transaction
				3341	* A and before transaction A commits you free that leaf, you call this with
				3342	* reserve set to 0 in order to clear the reservation.
				3343	*/
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3344	void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3345	u64 num_bytes, int delalloc)
				3346	{
				3347	struct btrfs_space_info *space_info = cache->space_info;
				3348
				3349	spin_lock(&space_info->lock);
				3350	spin_lock(&cache->lock);
				3351	if (cache->ro)
				3352	space_info->bytes_readonly += num_bytes;
				3353	cache->reserved -= num_bytes;
				3354	space_info->bytes_reserved -= num_bytes;
				3355	space_info->max_extent_size = 0;
				3356
				3357	if (delalloc)
				3358	cache->delalloc_bytes -= num_bytes;
				3359	spin_unlock(&cache->lock);
Josef Bacik	3308234	2020-07-21 10:22:17 -0400	[diff] [blame]	3360
				3361	btrfs_try_granting_tickets(cache->fs_info, space_info);
Josef Bacik	606d1bf	2019-06-20 15:38:02 -0400	[diff] [blame]	3362	spin_unlock(&space_info->lock);
				3363	}
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3364
				3365	static void force_metadata_allocation(struct btrfs_fs_info *info)
				3366	{
				3367	struct list_head *head = &info->space_info;
				3368	struct btrfs_space_info *found;
				3369
Josef Bacik	7280490	2020-09-01 17:40:37 -0400	[diff] [blame]	3370	list_for_each_entry(found, head, list) {
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3371	if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
				3372	found->force_alloc = CHUNK_ALLOC_FORCE;
				3373	}
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3374	}
				3375
				3376	static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
				3377	struct btrfs_space_info *sinfo, int force)
				3378	{
				3379	u64 bytes_used = btrfs_space_info_used(sinfo, false);
				3380	u64 thresh;
				3381
				3382	if (force == CHUNK_ALLOC_FORCE)
				3383	return 1;
				3384
				3385	/*
				3386	* in limited mode, we want to have some free space up to
				3387	* about 1% of the FS size.
				3388	*/
				3389	if (force == CHUNK_ALLOC_LIMITED) {
				3390	thresh = btrfs_super_total_bytes(fs_info->super_copy);
				3391	thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
				3392
				3393	if (sinfo->total_bytes - bytes_used < thresh)
				3394	return 1;
				3395	}
				3396
				3397	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
				3398	return 0;
				3399	return 1;
				3400	}
				3401
				3402	int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
				3403	{
				3404	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
				3405
				3406	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
				3407	}
				3408
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3409	static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
				3410	{
				3411	struct btrfs_block_group *bg;
				3412	int ret;
				3413
				3414	/*
				3415	* Check if we have enough space in the system space info because we
				3416	* will need to update device items in the chunk btree and insert a new
				3417	* chunk item in the chunk btree as well. This will allocate a new
				3418	* system block group if needed.
				3419	*/
				3420	check_system_chunk(trans, flags);
				3421
Nikolay Borisov	f6f39f7	2021-08-18 13:41:19 +0300	[diff] [blame]	3422	bg = btrfs_create_chunk(trans, flags);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3423	if (IS_ERR(bg)) {
				3424	ret = PTR_ERR(bg);
				3425	goto out;
				3426	}
				3427
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3428	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
				3429	/*
				3430	* Normally we are not expected to fail with -ENOSPC here, since we have
				3431	* previously reserved space in the system space_info and allocated one
Filipe Manana	ecd84d5	2021-10-13 10:12:50 +0100	[diff] [blame^]	3432	* new system chunk if necessary. However there are three exceptions:
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3433	*
				3434	* 1) We may have enough free space in the system space_info but all the
				3435	* existing system block groups have a profile which can not be used
				3436	* for extent allocation.
				3437	*
				3438	* This happens when mounting in degraded mode. For example we have a
				3439	* RAID1 filesystem with 2 devices, lose one device and mount the fs
				3440	* using the other device in degraded mode. If we then allocate a chunk,
				3441	* we may have enough free space in the existing system space_info, but
				3442	* none of the block groups can be used for extent allocation since they
				3443	* have a RAID1 profile, and because we are in degraded mode with a
				3444	* single device, we are forced to allocate a new system chunk with a
				3445	* SINGLE profile. Making check_system_chunk() iterate over all system
				3446	* block groups and check if they have a usable profile and enough space
				3447	* can be slow on very large filesystems, so we tolerate the -ENOSPC and
				3448	* try again after forcing allocation of a new system chunk. Like this
				3449	* we avoid paying the cost of that search in normal circumstances, when
				3450	* we were not mounted in degraded mode;
				3451	*
				3452	* 2) We had enough free space info the system space_info, and one suitable
				3453	* block group to allocate from when we called check_system_chunk()
				3454	* above. However right after we called it, the only system block group
				3455	* with enough free space got turned into RO mode by a running scrub,
				3456	* and in this case we have to allocate a new one and retry. We only
				3457	* need do this allocate and retry once, since we have a transaction
Filipe Manana	ecd84d5	2021-10-13 10:12:50 +0100	[diff] [blame^]	3458	* handle and scrub uses the commit root to search for block groups;
				3459	*
				3460	* 3) We had one system block group with enough free space when we called
				3461	* check_system_chunk(), but after that, right before we tried to
				3462	* allocate the last extent buffer we needed, a discard operation came
				3463	* in and it temporarily removed the last free space entry from the
				3464	* block group (discard removes a free space entry, discards it, and
				3465	* then adds back the entry to the block group cache).
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3466	*/
				3467	if (ret == -ENOSPC) {
				3468	const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
				3469	struct btrfs_block_group *sys_bg;
				3470
Nikolay Borisov	f6f39f7	2021-08-18 13:41:19 +0300	[diff] [blame]	3471	sys_bg = btrfs_create_chunk(trans, sys_flags);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3472	if (IS_ERR(sys_bg)) {
				3473	ret = PTR_ERR(sys_bg);
				3474	btrfs_abort_transaction(trans, ret);
				3475	goto out;
				3476	}
				3477
				3478	ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
				3479	if (ret) {
				3480	btrfs_abort_transaction(trans, ret);
				3481	goto out;
				3482	}
				3483
				3484	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
				3485	if (ret) {
				3486	btrfs_abort_transaction(trans, ret);
				3487	goto out;
				3488	}
				3489	} else if (ret) {
				3490	btrfs_abort_transaction(trans, ret);
				3491	goto out;
				3492	}
				3493	out:
				3494	btrfs_trans_release_chunk_metadata(trans);
				3495
				3496	return ret;
				3497	}
				3498
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3499	/*
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3500	* Chunk allocation is done in 2 phases:
				3501	*
				3502	* 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
				3503	* the chunk, the chunk mapping, create its block group and add the items
				3504	* that belong in the chunk btree to it - more specifically, we need to
				3505	* update device items in the chunk btree and add a new chunk item to it.
				3506	*
				3507	* 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
				3508	* group item to the extent btree and the device extent items to the devices
				3509	* btree.
				3510	*
				3511	* This is done to prevent deadlocks. For example when COWing a node from the
				3512	* extent btree we are holding a write lock on the node's parent and if we
				3513	* trigger chunk allocation and attempted to insert the new block group item
				3514	* in the extent btree right away, we could deadlock because the path for the
				3515	* insertion can include that parent node. At first glance it seems impossible
				3516	* to trigger chunk allocation after starting a transaction since tasks should
				3517	* reserve enough transaction units (metadata space), however while that is true
				3518	* most of the time, chunk allocation may still be triggered for several reasons:
				3519	*
				3520	* 1) When reserving metadata, we check if there is enough free space in the
				3521	* metadata space_info and therefore don't trigger allocation of a new chunk.
				3522	* However later when the task actually tries to COW an extent buffer from
				3523	* the extent btree or from the device btree for example, it is forced to
				3524	* allocate a new block group (chunk) because the only one that had enough
				3525	* free space was just turned to RO mode by a running scrub for example (or
				3526	* device replace, block group reclaim thread, etc), so we can not use it
				3527	* for allocating an extent and end up being forced to allocate a new one;
				3528	*
				3529	* 2) Because we only check that the metadata space_info has enough free bytes,
				3530	* we end up not allocating a new metadata chunk in that case. However if
				3531	* the filesystem was mounted in degraded mode, none of the existing block
				3532	* groups might be suitable for extent allocation due to their incompatible
				3533	* profile (for e.g. mounting a 2 devices filesystem, where all block groups
				3534	* use a RAID1 profile, in degraded mode using a single device). In this case
				3535	* when the task attempts to COW some extent buffer of the extent btree for
				3536	* example, it will trigger allocation of a new metadata block group with a
				3537	* suitable profile (SINGLE profile in the example of the degraded mount of
				3538	* the RAID1 filesystem);
				3539	*
				3540	* 3) The task has reserved enough transaction units / metadata space, but when
				3541	* it attempts to COW an extent buffer from the extent or device btree for
				3542	* example, it does not find any free extent in any metadata block group,
				3543	* therefore forced to try to allocate a new metadata block group.
				3544	* This is because some other task allocated all available extents in the
				3545	* meanwhile - this typically happens with tasks that don't reserve space
				3546	* properly, either intentionally or as a bug. One example where this is
				3547	* done intentionally is fsync, as it does not reserve any transaction units
				3548	* and ends up allocating a variable number of metadata extents for log
Filipe Manana	ecd84d5	2021-10-13 10:12:50 +0100	[diff] [blame^]	3549	* tree extent buffers;
				3550	*
				3551	* 4) The task has reserved enough transaction units / metadata space, but right
				3552	* before it tries to allocate the last extent buffer it needs, a discard
				3553	* operation comes in and, temporarily, removes the last free space entry from
				3554	* the only metadata block group that had free space (discard starts by
				3555	* removing a free space entry from a block group, then does the discard
				3556	* operation and, once it's done, it adds back the free space entry to the
				3557	* block group).
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3558	*
				3559	* We also need this 2 phases setup when adding a device to a filesystem with
				3560	* a seed device - we must create new metadata and system chunks without adding
				3561	* any of the block group items to the chunk, extent and device btrees. If we
				3562	* did not do it this way, we would get ENOSPC when attempting to update those
				3563	* btrees, since all the chunks from the seed device are read-only.
				3564	*
				3565	* Phase 1 does the updates and insertions to the chunk btree because if we had
				3566	* it done in phase 2 and have a thundering herd of tasks allocating chunks in
				3567	* parallel, we risk having too many system chunks allocated by many tasks if
				3568	* many tasks reach phase 1 without the previous ones completing phase 2. In the
				3569	* extreme case this leads to exhaustion of the system chunk array in the
				3570	* superblock. This is easier to trigger if using a btree node/leaf size of 64K
				3571	* and with RAID filesystems (so we have more device items in the chunk btree).
				3572	* This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
				3573	* the system chunk array due to concurrent allocations") provides more details.
				3574	*
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3575	* Allocation of system chunks does not happen through this function. A task that
				3576	* needs to update the chunk btree (the only btree that uses system chunks), must
				3577	* preallocate chunk space by calling either check_system_chunk() or
				3578	* btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
				3579	* metadata chunk or when removing a chunk, while the latter is used before doing
				3580	* a modification to the chunk btree - use cases for the latter are adding,
				3581	* removing and resizing a device as well as relocation of a system chunk.
				3582	* See the comment below for more details.
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3583	*
				3584	* The reservation of system space, done through check_system_chunk(), as well
				3585	* as all the updates and insertions into the chunk btree must be done while
				3586	* holding fs_info->chunk_mutex. This is important to guarantee that while COWing
				3587	* an extent buffer from the chunks btree we never trigger allocation of a new
				3588	* system chunk, which would result in a deadlock (trying to lock twice an
				3589	* extent buffer of the chunk btree, first time before triggering the chunk
				3590	* allocation and the second time during chunk allocation while attempting to
				3591	* update the chunks btree). The system chunk array is also updated while holding
				3592	* that mutex. The same logic applies to removing chunks - we must reserve system
				3593	* space, update the chunk btree and the system chunk array in the superblock
				3594	* while holding fs_info->chunk_mutex.
				3595	*
				3596	* This function, btrfs_chunk_alloc(), belongs to phase 1.
				3597	*
				3598	* If @force is CHUNK_ALLOC_FORCE:
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3599	* - return 1 if it successfully allocates a chunk,
				3600	* - return errors including -ENOSPC otherwise.
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3601	* If @force is NOT CHUNK_ALLOC_FORCE:
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3602	* - return 0 if it doesn't need to allocate a new chunk,
				3603	* - return 1 if it successfully allocates a chunk,
				3604	* - return errors including -ENOSPC otherwise.
				3605	*/
				3606	int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
				3607	enum btrfs_chunk_alloc_enum force)
				3608	{
				3609	struct btrfs_fs_info *fs_info = trans->fs_info;
				3610	struct btrfs_space_info *space_info;
				3611	bool wait_for_alloc = false;
				3612	bool should_alloc = false;
				3613	int ret = 0;
				3614
				3615	/* Don't re-enter if we're already allocating a chunk */
				3616	if (trans->allocating_chunk)
				3617	return -ENOSPC;
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3618	/*
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3619	* Allocation of system chunks can not happen through this path, as we
				3620	* could end up in a deadlock if we are allocating a data or metadata
				3621	* chunk and there is another task modifying the chunk btree.
				3622	*
				3623	* This is because while we are holding the chunk mutex, we will attempt
				3624	* to add the new chunk item to the chunk btree or update an existing
				3625	* device item in the chunk btree, while the other task that is modifying
				3626	* the chunk btree is attempting to COW an extent buffer while holding a
				3627	* lock on it and on its parent - if the COW operation triggers a system
				3628	* chunk allocation, then we can deadlock because we are holding the
				3629	* chunk mutex and we may need to access that extent buffer or its parent
				3630	* in order to add the chunk item or update a device item.
				3631	*
				3632	* Tasks that want to modify the chunk tree should reserve system space
				3633	* before updating the chunk btree, by calling either
				3634	* btrfs_reserve_chunk_metadata() or check_system_chunk().
				3635	* It's possible that after a task reserves the space, it still ends up
				3636	* here - this happens in the cases described above at do_chunk_alloc().
				3637	* The task will have to either retry or fail.
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3638	*/
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3639	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3640	return -ENOSPC;
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3641
				3642	space_info = btrfs_find_space_info(fs_info, flags);
				3643	ASSERT(space_info);
				3644
				3645	do {
				3646	spin_lock(&space_info->lock);
				3647	if (force < space_info->force_alloc)
				3648	force = space_info->force_alloc;
				3649	should_alloc = should_alloc_chunk(fs_info, space_info, force);
				3650	if (space_info->full) {
				3651	/* No more free physical space */
				3652	if (should_alloc)
				3653	ret = -ENOSPC;
				3654	else
				3655	ret = 0;
				3656	spin_unlock(&space_info->lock);
				3657	return ret;
				3658	} else if (!should_alloc) {
				3659	spin_unlock(&space_info->lock);
				3660	return 0;
				3661	} else if (space_info->chunk_alloc) {
				3662	/*
				3663	* Someone is already allocating, so we need to block
				3664	* until this someone is finished and then loop to
				3665	* recheck if we should continue with our allocation
				3666	* attempt.
				3667	*/
				3668	wait_for_alloc = true;
				3669	spin_unlock(&space_info->lock);
				3670	mutex_lock(&fs_info->chunk_mutex);
				3671	mutex_unlock(&fs_info->chunk_mutex);
				3672	} else {
				3673	/* Proceed with allocation */
				3674	space_info->chunk_alloc = 1;
				3675	wait_for_alloc = false;
				3676	spin_unlock(&space_info->lock);
				3677	}
				3678
				3679	cond_resched();
				3680	} while (wait_for_alloc);
				3681
				3682	mutex_lock(&fs_info->chunk_mutex);
				3683	trans->allocating_chunk = true;
				3684
				3685	/*
				3686	* If we have mixed data/metadata chunks we want to make sure we keep
				3687	* allocating mixed chunks instead of individual chunks.
				3688	*/
				3689	if (btrfs_mixed_space_info(space_info))
				3690	flags \|= (BTRFS_BLOCK_GROUP_DATA \| BTRFS_BLOCK_GROUP_METADATA);
				3691
				3692	/*
				3693	* if we're doing a data chunk, go ahead and make sure that
				3694	* we keep a reasonable number of metadata chunks allocated in the
				3695	* FS as well.
				3696	*/
				3697	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
				3698	fs_info->data_chunk_allocations++;
				3699	if (!(fs_info->data_chunk_allocations %
				3700	fs_info->metadata_ratio))
				3701	force_metadata_allocation(fs_info);
				3702	}
				3703
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3704	ret = do_chunk_alloc(trans, flags);
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3705	trans->allocating_chunk = false;
				3706
				3707	spin_lock(&space_info->lock);
				3708	if (ret < 0) {
				3709	if (ret == -ENOSPC)
				3710	space_info->full = 1;
				3711	else
				3712	goto out;
				3713	} else {
				3714	ret = 1;
				3715	space_info->max_extent_size = 0;
				3716	}
				3717
				3718	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
				3719	out:
				3720	space_info->chunk_alloc = 0;
				3721	spin_unlock(&space_info->lock);
				3722	mutex_unlock(&fs_info->chunk_mutex);
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3723
				3724	return ret;
				3725	}
				3726
				3727	static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
				3728	{
				3729	u64 num_dev;
				3730
				3731	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
				3732	if (!num_dev)
				3733	num_dev = fs_info->fs_devices->rw_devices;
				3734
				3735	return num_dev;
				3736	}
				3737
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3738	static void reserve_chunk_space(struct btrfs_trans_handle *trans,
				3739	u64 bytes,
				3740	u64 type)
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3741	{
				3742	struct btrfs_fs_info *fs_info = trans->fs_info;
				3743	struct btrfs_space_info *info;
				3744	u64 left;
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3745	int ret = 0;
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3746
				3747	/*
				3748	* Needed because we can end up allocating a system chunk and for an
				3749	* atomic and race free space reservation in the chunk block reserve.
				3750	*/
				3751	lockdep_assert_held(&fs_info->chunk_mutex);
				3752
				3753	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
				3754	spin_lock(&info->lock);
				3755	left = info->total_bytes - btrfs_space_info_used(info, true);
				3756	spin_unlock(&info->lock);
				3757
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3758	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3759	btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3760	left, bytes, type);
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3761	btrfs_dump_space_info(fs_info, info, 0, 0);
				3762	}
				3763
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3764	if (left < bytes) {
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3765	u64 flags = btrfs_system_alloc_profile(fs_info);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3766	struct btrfs_block_group *bg;
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3767
				3768	/*
				3769	* Ignore failure to create system chunk. We might end up not
				3770	* needing it, as we might not need to COW all nodes/leafs from
				3771	* the paths we visit in the chunk tree (they were already COWed
				3772	* or created in the current transaction for example).
				3773	*/
Nikolay Borisov	f6f39f7	2021-08-18 13:41:19 +0300	[diff] [blame]	3774	bg = btrfs_create_chunk(trans, flags);
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3775	if (IS_ERR(bg)) {
				3776	ret = PTR_ERR(bg);
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3777	} else {
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3778	/*
				3779	* If we fail to add the chunk item here, we end up
				3780	* trying again at phase 2 of chunk allocation, at
				3781	* btrfs_create_pending_block_groups(). So ignore
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3782	* any error here. An ENOSPC here could happen, due to
				3783	* the cases described at do_chunk_alloc() - the system
				3784	* block group we just created was just turned into RO
				3785	* mode by a scrub for example, or a running discard
				3786	* temporarily removed its free space entries, etc.
Filipe Manana	79bd371	2021-06-29 14:43:06 +0100	[diff] [blame]	3787	*/
				3788	btrfs_chunk_alloc_add_chunk_item(trans, bg);
				3789	}
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3790	}
				3791
				3792	if (!ret) {
				3793	ret = btrfs_block_rsv_add(fs_info->chunk_root,
				3794	&fs_info->chunk_block_rsv,
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3795	bytes, BTRFS_RESERVE_NO_FLUSH);
Filipe Manana	1cb3db1	2021-06-29 14:43:05 +0100	[diff] [blame]	3796	if (!ret)
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3797	trans->chunk_bytes_reserved += bytes;
Josef Bacik	07730d8	2019-06-20 15:38:04 -0400	[diff] [blame]	3798	}
				3799	}
				3800
Filipe Manana	2bb2e00	2021-10-13 10:12:49 +0100	[diff] [blame]	3801	/*
				3802	* Reserve space in the system space for allocating or removing a chunk.
				3803	* The caller must be holding fs_info->chunk_mutex.
				3804	*/
				3805	void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
				3806	{
				3807	struct btrfs_fs_info *fs_info = trans->fs_info;
				3808	const u64 num_devs = get_profile_num_devs(fs_info, type);
				3809	u64 bytes;
				3810
				3811	/* num_devs device items to update and 1 chunk item to add or remove. */
				3812	bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
				3813	btrfs_calc_insert_metadata_size(fs_info, 1);
				3814
				3815	reserve_chunk_space(trans, bytes, type);
				3816	}
				3817
				3818	/*
				3819	* Reserve space in the system space, if needed, for doing a modification to the
				3820	* chunk btree.
				3821	*
				3822	* @trans: A transaction handle.
				3823	* @is_item_insertion: Indicate if the modification is for inserting a new item
				3824	* in the chunk btree or if it's for the deletion or update
				3825	* of an existing item.
				3826	*
				3827	* This is used in a context where we need to update the chunk btree outside
				3828	* block group allocation and removal, to avoid a deadlock with a concurrent
				3829	* task that is allocating a metadata or data block group and therefore needs to
				3830	* update the chunk btree while holding the chunk mutex. After the update to the
				3831	* chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
				3832	*
				3833	*/
				3834	void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
				3835	bool is_item_insertion)
				3836	{
				3837	struct btrfs_fs_info *fs_info = trans->fs_info;
				3838	u64 bytes;
				3839
				3840	if (is_item_insertion)
				3841	bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
				3842	else
				3843	bytes = btrfs_calc_metadata_size(fs_info, 1);
				3844
				3845	mutex_lock(&fs_info->chunk_mutex);
				3846	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
				3847	mutex_unlock(&fs_info->chunk_mutex);
				3848	}
				3849
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3850	void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
				3851	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3852	struct btrfs_block_group *block_group;
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3853	u64 last = 0;
				3854
				3855	while (1) {
				3856	struct inode *inode;
				3857
				3858	block_group = btrfs_lookup_first_block_group(info, last);
				3859	while (block_group) {
				3860	btrfs_wait_block_group_cache_done(block_group);
				3861	spin_lock(&block_group->lock);
				3862	if (block_group->iref)
				3863	break;
				3864	spin_unlock(&block_group->lock);
				3865	block_group = btrfs_next_block_group(block_group);
				3866	}
				3867	if (!block_group) {
				3868	if (last == 0)
				3869	break;
				3870	last = 0;
				3871	continue;
				3872	}
				3873
				3874	inode = block_group->inode;
				3875	block_group->iref = 0;
				3876	block_group->inode = NULL;
				3877	spin_unlock(&block_group->lock);
				3878	ASSERT(block_group->io_ctl.inode == NULL);
				3879	iput(inode);
David Sterba	b3470b5	2019-10-23 18:48:22 +0200	[diff] [blame]	3880	last = block_group->start + block_group->length;
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3881	btrfs_put_block_group(block_group);
				3882	}
				3883	}
				3884
				3885	/*
				3886	* Must be called only after stopping all workers, since we could have block
				3887	* group caching kthreads running, and therefore they could race with us if we
				3888	* freed the block groups before stopping them.
				3889	*/
				3890	int btrfs_free_block_groups(struct btrfs_fs_info *info)
				3891	{
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3892	struct btrfs_block_group *block_group;
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3893	struct btrfs_space_info *space_info;
				3894	struct btrfs_caching_control *caching_ctl;
				3895	struct rb_node *n;
				3896
Josef Bacik	bbb86a3	2020-10-23 09:58:11 -0400	[diff] [blame]	3897	spin_lock(&info->block_group_cache_lock);
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3898	while (!list_empty(&info->caching_block_groups)) {
				3899	caching_ctl = list_entry(info->caching_block_groups.next,
				3900	struct btrfs_caching_control, list);
				3901	list_del(&caching_ctl->list);
				3902	btrfs_put_caching_control(caching_ctl);
				3903	}
Josef Bacik	bbb86a3	2020-10-23 09:58:11 -0400	[diff] [blame]	3904	spin_unlock(&info->block_group_cache_lock);
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3905
				3906	spin_lock(&info->unused_bgs_lock);
				3907	while (!list_empty(&info->unused_bgs)) {
				3908	block_group = list_first_entry(&info->unused_bgs,
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3909	struct btrfs_block_group,
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3910	bg_list);
				3911	list_del_init(&block_group->bg_list);
				3912	btrfs_put_block_group(block_group);
				3913	}
				3914	spin_unlock(&info->unused_bgs_lock);
				3915
Johannes Thumshirn	18bb8bb	2021-04-19 16:41:02 +0900	[diff] [blame]	3916	spin_lock(&info->unused_bgs_lock);
				3917	while (!list_empty(&info->reclaim_bgs)) {
				3918	block_group = list_first_entry(&info->reclaim_bgs,
				3919	struct btrfs_block_group,
				3920	bg_list);
				3921	list_del_init(&block_group->bg_list);
				3922	btrfs_put_block_group(block_group);
				3923	}
				3924	spin_unlock(&info->unused_bgs_lock);
				3925
Naohiro Aota	afba2bc	2021-08-19 21:19:17 +0900	[diff] [blame]	3926	spin_lock(&info->zone_active_bgs_lock);
				3927	while (!list_empty(&info->zone_active_bgs)) {
				3928	block_group = list_first_entry(&info->zone_active_bgs,
				3929	struct btrfs_block_group,
				3930	active_bg_list);
				3931	list_del_init(&block_group->active_bg_list);
				3932	btrfs_put_block_group(block_group);
				3933	}
				3934	spin_unlock(&info->zone_active_bgs_lock);
				3935
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3936	spin_lock(&info->block_group_cache_lock);
				3937	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
David Sterba	32da5386	2019-10-29 19:20:18 +0100	[diff] [blame]	3938	block_group = rb_entry(n, struct btrfs_block_group,
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3939	cache_node);
				3940	rb_erase(&block_group->cache_node,
				3941	&info->block_group_cache_tree);
				3942	RB_CLEAR_NODE(&block_group->cache_node);
				3943	spin_unlock(&info->block_group_cache_lock);
				3944
				3945	down_write(&block_group->space_info->groups_sem);
				3946	list_del(&block_group->list);
				3947	up_write(&block_group->space_info->groups_sem);
				3948
				3949	/*
				3950	* We haven't cached this block group, which means we could
				3951	* possibly have excluded extents on this block group.
				3952	*/
				3953	if (block_group->cached == BTRFS_CACHE_NO \|\|
				3954	block_group->cached == BTRFS_CACHE_ERROR)
				3955	btrfs_free_excluded_extents(block_group);
				3956
				3957	btrfs_remove_free_space_cache(block_group);
				3958	ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
				3959	ASSERT(list_empty(&block_group->dirty_list));
				3960	ASSERT(list_empty(&block_group->io_list));
				3961	ASSERT(list_empty(&block_group->bg_list));
Josef Bacik	48aaeeb	2020-07-06 09:14:11 -0400	[diff] [blame]	3962	ASSERT(refcount_read(&block_group->refs) == 1);
Filipe Manana	195a49e	2021-02-05 12:55:37 +0000	[diff] [blame]	3963	ASSERT(block_group->swap_extents == 0);
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3964	btrfs_put_block_group(block_group);
				3965
				3966	spin_lock(&info->block_group_cache_lock);
				3967	}
				3968	spin_unlock(&info->block_group_cache_lock);
				3969
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3970	btrfs_release_global_block_rsv(info);
				3971
				3972	while (!list_empty(&info->space_info)) {
				3973	space_info = list_entry(info->space_info.next,
				3974	struct btrfs_space_info,
				3975	list);
				3976
				3977	/*
				3978	* Do not hide this behind enospc_debug, this is actually
				3979	* important and indicates a real bug if this happens.
				3980	*/
				3981	if (WARN_ON(space_info->bytes_pinned > 0 \|\|
				3982	space_info->bytes_reserved > 0 \|\|
				3983	space_info->bytes_may_use > 0))
				3984	btrfs_dump_space_info(info, space_info, 0, 0);
Filipe Manana	d611add	2020-04-07 11:38:49 +0100	[diff] [blame]	3985	WARN_ON(space_info->reclaim_size > 0);
Josef Bacik	3e43c27	2019-06-20 15:38:06 -0400	[diff] [blame]	3986	list_del(&space_info->list);
				3987	btrfs_sysfs_remove_space_info(space_info);
				3988	}
				3989	return 0;
				3990	}
Filipe Manana	684b752	2020-05-08 11:01:59 +0100	[diff] [blame]	3991
				3992	void btrfs_freeze_block_group(struct btrfs_block_group *cache)
				3993	{
				3994	atomic_inc(&cache->frozen);
				3995	}
				3996
				3997	void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
				3998	{
				3999	struct btrfs_fs_info *fs_info = block_group->fs_info;
				4000	struct extent_map_tree *em_tree;
				4001	struct extent_map *em;
				4002	bool cleanup;
				4003
				4004	spin_lock(&block_group->lock);
				4005	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
				4006	block_group->removed);
				4007	spin_unlock(&block_group->lock);
				4008
				4009	if (cleanup) {
Filipe Manana	684b752	2020-05-08 11:01:59 +0100	[diff] [blame]	4010	em_tree = &fs_info->mapping_tree;
				4011	write_lock(&em_tree->lock);
				4012	em = lookup_extent_mapping(em_tree, block_group->start,
				4013	1);
				4014	BUG_ON(!em); /* logic error, can't happen */
				4015	remove_extent_mapping(em_tree, em);
				4016	write_unlock(&em_tree->lock);
Filipe Manana	684b752	2020-05-08 11:01:59 +0100	[diff] [blame]	4017
				4018	/* once for us and once for the tree */
				4019	free_extent_map(em);
				4020	free_extent_map(em);
				4021
				4022	/*
				4023	* We may have left one free space entry and other possible
				4024	* tasks trimming this block group have left 1 entry each one.
				4025	* Free them if any.
				4026	*/
				4027	__btrfs_remove_free_space_cache(block_group->free_space_ctl);
				4028	}
				4029	}
Filipe Manana	195a49e	2021-02-05 12:55:37 +0000	[diff] [blame]	4030
				4031	bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
				4032	{
				4033	bool ret = true;
				4034
				4035	spin_lock(&bg->lock);
				4036	if (bg->ro)
				4037	ret = false;
				4038	else
				4039	bg->swap_extents++;
				4040	spin_unlock(&bg->lock);
				4041
				4042	return ret;
				4043	}
				4044
				4045	void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
				4046	{
				4047	spin_lock(&bg->lock);
				4048	ASSERT(!bg->ro);
				4049	ASSERT(bg->swap_extents >= amount);
				4050	bg->swap_extents -= amount;
				4051	spin_unlock(&bg->lock);
				4052	}