// SPDX-License-Identifier: GPL-2.0

#include <linux/list_sort.h>
#include "misc.h"
#include "ctree.h"
#include "block-group.h"
#include "space-info.h"
#include "disk-io.h"
#include "free-space-cache.h"
#include "free-space-tree.h"
#include "volumes.h"
#include "transaction.h"
#include "ref-verify.h"
#include "sysfs.h"
#include "tree-log.h"
#include "delalloc-space.h"
#include "discard.h"
#include "raid56.h"
#include "zoned.h"

/*
 * Return target flags in extended format or 0 if restripe for this chunk_type
 * is not in progress
 *
 * Should be called with balance_lock held
 */
static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
{
        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
        u64 target = 0;

        if (!bctl)
                return 0;

        if (flags & BTRFS_BLOCK_GROUP_DATA &&
            bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
                target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
        } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
                   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
                target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
        } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
                   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
                target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
        }

        return target;
}

/*
 * @flags: available profiles in extended format (see ctree.h)
 *
 * Return the reduced profile in chunk format. If profile changing is in
 * progress (either running or paused), pick the target profile if it's
 * already available, otherwise fall back to plain reducing.
 */
static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
{
        u64 num_devices = fs_info->fs_devices->rw_devices;
        u64 target;
        u64 raid_type;
        u64 allowed = 0;

        /*
         * See if restripe for this chunk_type is in progress, if so try to
         * reduce to the target profile
         */
        spin_lock(&fs_info->balance_lock);
        target = get_restripe_target(fs_info, flags);
        if (target) {
                spin_unlock(&fs_info->balance_lock);
                return extended_to_chunk(target);
        }
        spin_unlock(&fs_info->balance_lock);

        /* First, mask out the RAID levels which aren't possible */
        for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
                if (num_devices >= btrfs_raid_array[raid_type].devs_min)
                        allowed |= btrfs_raid_array[raid_type].bg_flag;
        }
        allowed &= flags;

        if (allowed & BTRFS_BLOCK_GROUP_RAID6)
                allowed = BTRFS_BLOCK_GROUP_RAID6;
        else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
                allowed = BTRFS_BLOCK_GROUP_RAID5;
        else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
                allowed = BTRFS_BLOCK_GROUP_RAID10;
        else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
                allowed = BTRFS_BLOCK_GROUP_RAID1;
        else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
                allowed = BTRFS_BLOCK_GROUP_RAID0;

        flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;

        return extended_to_chunk(flags | allowed);
}

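/*
 * Return the preferred allocation profile for the given block group type
 * flags: start from the profile bits currently available for that type
 * (sampled under the profiles seqlock) and reduce them to a single profile
 * in chunk format.
 */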
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
{
        unsigned seq;
        u64 flags;

        do {
                flags = orig_flags;
                seq = read_seqbegin(&fs_info->profiles_lock);

                if (flags & BTRFS_BLOCK_GROUP_DATA)
                        flags |= fs_info->avail_data_alloc_bits;
                else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                        flags |= fs_info->avail_system_alloc_bits;
                else if (flags & BTRFS_BLOCK_GROUP_METADATA)
                        flags |= fs_info->avail_metadata_alloc_bits;
        } while (read_seqretry(&fs_info->profiles_lock, seq));

        return btrfs_reduce_alloc_profile(fs_info, flags);
}

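/*
 * Reference counting helpers for block groups: every btrfs_get_block_group()
 * must be paired with a btrfs_put_block_group(), and dropping the last
 * reference frees the structure.
 */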
void btrfs_get_block_group(struct btrfs_block_group *cache)
{
        refcount_inc(&cache->refs);
}

void btrfs_put_block_group(struct btrfs_block_group *cache)
{
        if (refcount_dec_and_test(&cache->refs)) {
                WARN_ON(cache->pinned > 0);
                WARN_ON(cache->reserved > 0);

                /*
                 * A block_group shouldn't be on the discard_list anymore.
                 * Remove the block_group from the discard_list to prevent us
                 * from causing a panic due to NULL pointer dereference.
                 */
                if (WARN_ON(!list_empty(&cache->discard_list)))
                        btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
                                                  cache);

                /*
                 * If not empty, someone is still holding the full_stripe_lock
                 * mutex, which can only be released by the caller. Freeing the
                 * block group now would cause a use-after-free when the caller
                 * tries to release the full stripe lock.
                 *
                 * There is no better way to resolve this, so just warn.
                 */
                WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
                kfree(cache->free_space_ctl);
                kfree(cache->physical_map);
                kfree(cache);
        }
}

/*
 * This adds the block group to the fs_info rb tree for the block group cache
 */
static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
                                       struct btrfs_block_group *block_group)
{
        struct rb_node **p;
        struct rb_node *parent = NULL;
        struct btrfs_block_group *cache;

        ASSERT(block_group->length != 0);

        spin_lock(&info->block_group_cache_lock);
        p = &info->block_group_cache_tree.rb_node;

        while (*p) {
                parent = *p;
                cache = rb_entry(parent, struct btrfs_block_group, cache_node);
                if (block_group->start < cache->start) {
                        p = &(*p)->rb_left;
                } else if (block_group->start > cache->start) {
                        p = &(*p)->rb_right;
                } else {
                        spin_unlock(&info->block_group_cache_lock);
                        return -EEXIST;
                }
        }

        rb_link_node(&block_group->cache_node, parent, p);
        rb_insert_color(&block_group->cache_node,
                        &info->block_group_cache_tree);

        if (info->first_logical_byte > block_group->start)
                info->first_logical_byte = block_group->start;

        spin_unlock(&info->block_group_cache_lock);

        return 0;
}

/*
 * This will return the block group at or after bytenr if contains is 0, else
 * it will return the block group that contains the bytenr
 */
static struct btrfs_block_group *block_group_cache_tree_search(
                struct btrfs_fs_info *info, u64 bytenr, int contains)
{
        struct btrfs_block_group *cache, *ret = NULL;
        struct rb_node *n;
        u64 end, start;

        spin_lock(&info->block_group_cache_lock);
        n = info->block_group_cache_tree.rb_node;

        while (n) {
                cache = rb_entry(n, struct btrfs_block_group, cache_node);
                end = cache->start + cache->length - 1;
                start = cache->start;

                if (bytenr < start) {
                        if (!contains && (!ret || start < ret->start))
                                ret = cache;
                        n = n->rb_left;
                } else if (bytenr > start) {
                        if (contains && bytenr <= end) {
                                ret = cache;
                                break;
                        }
                        n = n->rb_right;
                } else {
                        ret = cache;
                        break;
                }
        }
        if (ret) {
                btrfs_get_block_group(ret);
                if (bytenr == 0 && info->first_logical_byte > ret->start)
                        info->first_logical_byte = ret->start;
        }
        spin_unlock(&info->block_group_cache_lock);

        return ret;
}

/*
 * Return the block group that starts at or after bytenr
 */
struct btrfs_block_group *btrfs_lookup_first_block_group(
                struct btrfs_fs_info *info, u64 bytenr)
{
        return block_group_cache_tree_search(info, bytenr, 0);
}

/*
 * Return the block group that contains the given bytenr
 */
struct btrfs_block_group *btrfs_lookup_block_group(
                struct btrfs_fs_info *info, u64 bytenr)
{
        return block_group_cache_tree_search(info, bytenr, 1);
}

struct btrfs_block_group *btrfs_next_block_group(
                struct btrfs_block_group *cache)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct rb_node *node;

        spin_lock(&fs_info->block_group_cache_lock);

        /* If our block group was removed, we need a full search. */
        if (RB_EMPTY_NODE(&cache->cache_node)) {
                const u64 next_bytenr = cache->start + cache->length;

                spin_unlock(&fs_info->block_group_cache_lock);
                btrfs_put_block_group(cache);
                cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
                return cache;
        }
        node = rb_next(&cache->cache_node);
        btrfs_put_block_group(cache);
        if (node) {
                cache = rb_entry(node, struct btrfs_block_group, cache_node);
                btrfs_get_block_group(cache);
        } else
                cache = NULL;
        spin_unlock(&fs_info->block_group_cache_lock);
        return cache;
}

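/*
 * Track tasks performing NOCOW writes into a block group so that other
 * operations can wait for them to finish. The expected pattern (an
 * illustrative sketch, not a quote of any particular caller) is:
 *
 *	if (btrfs_inc_nocow_writers(fs_info, bytenr)) {
 *		... do the NOCOW write ...
 *		btrfs_dec_nocow_writers(fs_info, bytenr);
 *	} else {
 *		... fall back to COW, the block group is read-only ...
 *	}
 *
 * btrfs_inc_nocow_writers() returns false if the block group is read-only,
 * in which case no reference is kept and the caller must not call
 * btrfs_dec_nocow_writers().
 */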
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct btrfs_block_group *bg;
        bool ret = true;

        bg = btrfs_lookup_block_group(fs_info, bytenr);
        if (!bg)
                return false;

        spin_lock(&bg->lock);
        if (bg->ro)
                ret = false;
        else
                atomic_inc(&bg->nocow_writers);
        spin_unlock(&bg->lock);

        /* No put on block group, done by btrfs_dec_nocow_writers */
        if (!ret)
                btrfs_put_block_group(bg);

        return ret;
}

void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct btrfs_block_group *bg;

        bg = btrfs_lookup_block_group(fs_info, bytenr);
        ASSERT(bg);
        if (atomic_dec_and_test(&bg->nocow_writers))
                wake_up_var(&bg->nocow_writers);
        /*
         * Once for our lookup and once for the lookup done by a previous call
         * to btrfs_inc_nocow_writers()
         */
        btrfs_put_block_group(bg);
        btrfs_put_block_group(bg);
}

void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
{
        wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
}

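/*
 * Drop one reservation count on the block group containing @start and wake
 * up any task waiting in btrfs_wait_block_group_reservations().
 */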
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
                                        const u64 start)
{
        struct btrfs_block_group *bg;

        bg = btrfs_lookup_block_group(fs_info, start);
        ASSERT(bg);
        if (atomic_dec_and_test(&bg->reservations))
                wake_up_var(&bg->reservations);
        btrfs_put_block_group(bg);
}

void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
{
        struct btrfs_space_info *space_info = bg->space_info;

        ASSERT(bg->ro);

        if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
                return;

        /*
         * Our block group is read only but before we set it to read only,
         * some task might have allocated an extent from it already, but has
         * not yet created a respective ordered extent (and added it to a
         * root's list of ordered extents).
         * Therefore wait for any task currently allocating extents, since the
         * block group's reservations counter is incremented while a read lock
         * on the groups' semaphore is held and decremented after releasing
         * the read access on that semaphore and creating the ordered extent.
         */
        down_write(&space_info->groups_sem);
        up_write(&space_info->groups_sem);

        wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
}

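/*
 * Grab a reference on the caching control of @cache, or return NULL if
 * caching is not in progress. Release it with btrfs_put_caching_control().
 */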
struct btrfs_caching_control *btrfs_get_caching_control(
                struct btrfs_block_group *cache)
{
        struct btrfs_caching_control *ctl;

        spin_lock(&cache->lock);
        if (!cache->caching_ctl) {
                spin_unlock(&cache->lock);
                return NULL;
        }

        ctl = cache->caching_ctl;
        refcount_inc(&ctl->count);
        spin_unlock(&cache->lock);
        return ctl;
}

void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
{
        if (refcount_dec_and_test(&ctl->count))
                kfree(ctl);
}

/*
 * When we wait for progress in the block group caching, it's because our
 * allocation attempt failed at least once. So, we must sleep and let some
 * progress happen before we try again.
 *
 * This function will sleep at least once waiting for new free space to show
 * up, and then it will check the block group free space numbers for our min
 * num_bytes. Another option is to have it go ahead and look in the rbtree for
 * a free extent of a given size, but this is a good start.
 *
 * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
 * any of the information in this block group.
 */
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
                                           u64 num_bytes)
{
        struct btrfs_caching_control *caching_ctl;

        caching_ctl = btrfs_get_caching_control(cache);
        if (!caching_ctl)
                return;

        wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
                   (cache->free_space_ctl->free_space >= num_bytes));

        btrfs_put_caching_control(caching_ctl);
}

int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
{
        struct btrfs_caching_control *caching_ctl;
        int ret = 0;

        caching_ctl = btrfs_get_caching_control(cache);
        if (!caching_ctl)
                return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;

        wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
        if (cache->cached == BTRFS_CACHE_ERROR)
                ret = -EIO;
        btrfs_put_caching_control(caching_ctl);
        return ret;
}

static bool space_cache_v1_done(struct btrfs_block_group *cache)
{
        bool ret;

        spin_lock(&cache->lock);
        ret = cache->cached != BTRFS_CACHE_FAST;
        spin_unlock(&cache->lock);

        return ret;
}

void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
                                struct btrfs_caching_control *caching_ctl)
{
        wait_event(caching_ctl->wait, space_cache_v1_done(cache));
}

#ifdef CONFIG_BTRFS_DEBUG
static void fragment_free_space(struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        u64 start = block_group->start;
        u64 len = block_group->length;
        u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
                fs_info->nodesize : fs_info->sectorsize;
        u64 step = chunk << 1;

        while (len > chunk) {
                btrfs_remove_free_space(block_group, start, chunk);
                start += step;
                if (len < step)
                        len = 0;
                else
                        len -= step;
        }
}
#endif

/*
 * This is only called by btrfs_cache_block_group, since we could have freed
 * extents. We need to check the pinned_extents for any extents that can't be
 * used yet, because their free space will be released as soon as the
 * transaction commits.
 */
u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end)
{
        struct btrfs_fs_info *info = block_group->fs_info;
        u64 extent_start, extent_end, size, total_added = 0;
        int ret;

        while (start < end) {
                ret = find_first_extent_bit(&info->excluded_extents, start,
                                            &extent_start, &extent_end,
                                            EXTENT_DIRTY | EXTENT_UPTODATE,
                                            NULL);
                if (ret)
                        break;

                if (extent_start <= start) {
                        start = extent_end + 1;
                } else if (extent_start > start && extent_start < end) {
                        size = extent_start - start;
                        total_added += size;
                        ret = btrfs_add_free_space_async_trimmed(block_group,
                                                                 start, size);
                        BUG_ON(ret); /* -ENOMEM or logic error */
                        start = extent_end + 1;
                } else {
                        break;
                }
        }

        if (start < end) {
                size = end - start;
                total_added += size;
                ret = btrfs_add_free_space_async_trimmed(block_group, start,
                                                         size);
                BUG_ON(ret); /* -ENOMEM or logic error */
        }

        return total_added;
}

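/*
 * Slow path of the caching thread: walk the extent tree commit root and feed
 * the unallocated gaps between extent items in the block group's range to
 * add_new_free_space(), waking up waiters periodically as free space
 * accumulates.
 */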
static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
{
        struct btrfs_block_group *block_group = caching_ctl->block_group;
        struct btrfs_fs_info *fs_info = block_group->fs_info;
        struct btrfs_root *extent_root;
        struct btrfs_path *path;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        u64 total_found = 0;
        u64 last = 0;
        u32 nritems;
        int ret;
        bool wakeup = true;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
        extent_root = btrfs_extent_root(fs_info, last);

#ifdef CONFIG_BTRFS_DEBUG
        /*
         * If we're fragmenting we don't want to make anybody think we can
         * allocate from this block group until we've had a chance to fragment
         * the free space.
         */
        if (btrfs_should_fragment_free_space(block_group))
                wakeup = false;
#endif
        /*
         * We don't want to deadlock with somebody trying to allocate a new
         * extent for the extent root while also trying to search the extent
         * root to add free space. So we skip locking and search the commit
         * root, since it's read-only.
         */
        path->skip_locking = 1;
        path->search_commit_root = 1;
        path->reada = READA_FORWARD;

        key.objectid = last;
        key.offset = 0;
        key.type = BTRFS_EXTENT_ITEM_KEY;

next:
        ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        leaf = path->nodes[0];
        nritems = btrfs_header_nritems(leaf);

        while (1) {
                if (btrfs_fs_closing(fs_info) > 1) {
                        last = (u64)-1;
                        break;
                }

                if (path->slots[0] < nritems) {
                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
                } else {
                        ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
                        if (ret)
                                break;

                        if (need_resched() ||
                            rwsem_is_contended(&fs_info->commit_root_sem)) {
                                if (wakeup)
                                        caching_ctl->progress = last;
                                btrfs_release_path(path);
                                up_read(&fs_info->commit_root_sem);
                                mutex_unlock(&caching_ctl->mutex);
                                cond_resched();
                                mutex_lock(&caching_ctl->mutex);
                                down_read(&fs_info->commit_root_sem);
                                goto next;
                        }

                        ret = btrfs_next_leaf(extent_root, path);
                        if (ret < 0)
                                goto out;
                        if (ret)
                                break;
                        leaf = path->nodes[0];
                        nritems = btrfs_header_nritems(leaf);
                        continue;
                }

                if (key.objectid < last) {
                        key.objectid = last;
                        key.offset = 0;
                        key.type = BTRFS_EXTENT_ITEM_KEY;

                        if (wakeup)
                                caching_ctl->progress = last;
                        btrfs_release_path(path);
                        goto next;
                }

                if (key.objectid < block_group->start) {
                        path->slots[0]++;
                        continue;
                }

                if (key.objectid >= block_group->start + block_group->length)
                        break;

                if (key.type == BTRFS_EXTENT_ITEM_KEY ||
                    key.type == BTRFS_METADATA_ITEM_KEY) {
                        total_found += add_new_free_space(block_group, last,
                                                          key.objectid);
                        if (key.type == BTRFS_METADATA_ITEM_KEY)
                                last = key.objectid +
                                        fs_info->nodesize;
                        else
                                last = key.objectid + key.offset;

                        if (total_found > CACHING_CTL_WAKE_UP) {
                                total_found = 0;
                                if (wakeup)
                                        wake_up(&caching_ctl->wait);
                        }
                }
                path->slots[0]++;
        }
        ret = 0;

        total_found += add_new_free_space(block_group, last,
                                block_group->start + block_group->length);
        caching_ctl->progress = (u64)-1;

out:
        btrfs_free_path(path);
        return ret;
}

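/*
 * Worker that populates the free space cache for one block group: try the v1
 * space cache first (if enabled), then fall back to the free space tree or to
 * scanning the extent tree, and finally mark the group as finished or errored.
 */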
static noinline void caching_thread(struct btrfs_work *work)
{
        struct btrfs_block_group *block_group;
        struct btrfs_fs_info *fs_info;
        struct btrfs_caching_control *caching_ctl;
        int ret;

        caching_ctl = container_of(work, struct btrfs_caching_control, work);
        block_group = caching_ctl->block_group;
        fs_info = block_group->fs_info;

        mutex_lock(&caching_ctl->mutex);
        down_read(&fs_info->commit_root_sem);

        if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
                ret = load_free_space_cache(block_group);
                if (ret == 1) {
                        ret = 0;
                        goto done;
                }

                /*
                 * We failed to load the space cache, set ourselves to
                 * CACHE_STARTED and carry on.
                 */
                spin_lock(&block_group->lock);
                block_group->cached = BTRFS_CACHE_STARTED;
                spin_unlock(&block_group->lock);
                wake_up(&caching_ctl->wait);
        }

        /*
         * If we are in the transaction that populated the free space tree we
         * can't actually cache from the free space tree as our commit root and
         * real root are the same, so we could change the contents of the blocks
         * while caching. Instead do the slow caching in this case, and after
         * the transaction has committed we will be safe.
         */
        if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
            !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
                ret = load_free_space_tree(caching_ctl);
        else
                ret = load_extent_tree_free(caching_ctl);
done:
        spin_lock(&block_group->lock);
        block_group->caching_ctl = NULL;
        block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
        spin_unlock(&block_group->lock);

#ifdef CONFIG_BTRFS_DEBUG
        if (btrfs_should_fragment_free_space(block_group)) {
                u64 bytes_used;

                spin_lock(&block_group->space_info->lock);
                spin_lock(&block_group->lock);
                bytes_used = block_group->length - block_group->used;
                block_group->space_info->bytes_used += bytes_used >> 1;
                spin_unlock(&block_group->lock);
                spin_unlock(&block_group->space_info->lock);
                fragment_free_space(block_group);
        }
#endif

        caching_ctl->progress = (u64)-1;

        up_read(&fs_info->commit_root_sem);
        btrfs_free_excluded_extents(block_group);
        mutex_unlock(&caching_ctl->mutex);

        wake_up(&caching_ctl->wait);

        btrfs_put_caching_control(caching_ctl);
        btrfs_put_block_group(block_group);
}

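/*
 * Kick off caching of free space for @cache if it has not started yet. The
 * actual work runs in caching_thread() on the caching_workers workqueue;
 * with load_cache_only set this also waits until the group has left the
 * BTRFS_CACHE_FAST state (i.e. the v1 space cache load, if any, finished).
 * Zoned filesystems return immediately since their allocator does not use
 * the cache.
 */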
int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only)
{
        DEFINE_WAIT(wait);
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_caching_control *caching_ctl = NULL;
        int ret = 0;

        /* Allocator for zoned filesystems does not use the cache at all */
        if (btrfs_is_zoned(fs_info))
                return 0;

        caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
        if (!caching_ctl)
                return -ENOMEM;

        INIT_LIST_HEAD(&caching_ctl->list);
        mutex_init(&caching_ctl->mutex);
        init_waitqueue_head(&caching_ctl->wait);
        caching_ctl->block_group = cache;
        caching_ctl->progress = cache->start;
        refcount_set(&caching_ctl->count, 2);
        btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);

        spin_lock(&cache->lock);
        if (cache->cached != BTRFS_CACHE_NO) {
                kfree(caching_ctl);

                caching_ctl = cache->caching_ctl;
                if (caching_ctl)
                        refcount_inc(&caching_ctl->count);
                spin_unlock(&cache->lock);
                goto out;
        }
        WARN_ON(cache->caching_ctl);
        cache->caching_ctl = caching_ctl;
        if (btrfs_test_opt(fs_info, SPACE_CACHE))
                cache->cached = BTRFS_CACHE_FAST;
        else
                cache->cached = BTRFS_CACHE_STARTED;
        cache->has_caching_ctl = 1;
        spin_unlock(&cache->lock);

        spin_lock(&fs_info->block_group_cache_lock);
        refcount_inc(&caching_ctl->count);
        list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
        spin_unlock(&fs_info->block_group_cache_lock);

        btrfs_get_block_group(cache);

        btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
out:
        if (load_cache_only && caching_ctl)
                btrfs_wait_space_cache_v1_finished(cache, caching_ctl);
        if (caching_ctl)
                btrfs_put_caching_control(caching_ctl);

        return ret;
}

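/*
 * Remove the profile bits of @flags from the set of profiles that are still
 * available for allocation of the corresponding block group type.
 */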
static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
        u64 extra_flags = chunk_to_extended(flags) &
                                BTRFS_EXTENDED_PROFILE_MASK;

        write_seqlock(&fs_info->profiles_lock);
        if (flags & BTRFS_BLOCK_GROUP_DATA)
                fs_info->avail_data_alloc_bits &= ~extra_flags;
        if (flags & BTRFS_BLOCK_GROUP_METADATA)
                fs_info->avail_metadata_alloc_bits &= ~extra_flags;
        if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
                fs_info->avail_system_alloc_bits &= ~extra_flags;
        write_sequnlock(&fs_info->profiles_lock);
}

/*
 * Clear incompat bits for the following feature(s):
 *
 * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
 *            in the whole filesystem
 *
 * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
 */
static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
{
        bool found_raid56 = false;
        bool found_raid1c34 = false;

        if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
            (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
            (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
                struct list_head *head = &fs_info->space_info;
                struct btrfs_space_info *sinfo;

                list_for_each_entry_rcu(sinfo, head, list) {
                        down_read(&sinfo->groups_sem);
                        if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
                                found_raid56 = true;
                        if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
                                found_raid56 = true;
                        if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
                                found_raid1c34 = true;
                        if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
                                found_raid1c34 = true;
                        up_read(&sinfo->groups_sem);
                }
                if (!found_raid56)
                        btrfs_clear_fs_incompat(fs_info, RAID56);
                if (!found_raid1c34)
                        btrfs_clear_fs_incompat(fs_info, RAID1C34);
        }
}

static int remove_block_group_item(struct btrfs_trans_handle *trans,
                                   struct btrfs_path *path,
                                   struct btrfs_block_group *block_group)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_root *root;
        struct btrfs_key key;
        int ret;

        root = btrfs_block_group_root(fs_info);
        key.objectid = block_group->start;
        key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        key.offset = block_group->length;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
        if (ret < 0)
                return ret;

        ret = btrfs_del_item(trans, root, path);
        return ret;
}

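/*
 * Remove an empty block group: drop it from the lookup structures and the
 * per-space-info lists, delete its free space (tree and/or cache) and its
 * block group item, and, unless the group is frozen, remove the extent map
 * of the underlying chunk (@em) from the mapping tree.
 */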
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             u64 group_start, struct extent_map *em)
{
        struct btrfs_fs_info *fs_info = trans->fs_info;
        struct btrfs_path *path;
        struct btrfs_block_group *block_group;
        struct btrfs_free_cluster *cluster;
        struct inode *inode;
        struct kobject *kobj = NULL;
        int ret;
        int index;
        int factor;
        struct btrfs_caching_control *caching_ctl = NULL;
        bool remove_em;
        bool remove_rsv = false;

        block_group = btrfs_lookup_block_group(fs_info, group_start);
        BUG_ON(!block_group);
        BUG_ON(!block_group->ro);

        trace_btrfs_remove_block_group(block_group);
        /*
         * Free the reserved super bytes from this block group before
         * removing it.
         */
        btrfs_free_excluded_extents(block_group);
        btrfs_free_ref_tree_range(fs_info, block_group->start,
                                  block_group->length);

        index = btrfs_bg_flags_to_raid_index(block_group->flags);
        factor = btrfs_bg_type_to_factor(block_group->flags);

        /* make sure this block group isn't part of an allocation cluster */
        cluster = &fs_info->data_alloc_cluster;
        spin_lock(&cluster->refill_lock);
        btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&cluster->refill_lock);

        /*
         * make sure this block group isn't part of a metadata
         * allocation cluster
         */
        cluster = &fs_info->meta_alloc_cluster;
        spin_lock(&cluster->refill_lock);
        btrfs_return_cluster_to_free_space(block_group, cluster);
        spin_unlock(&cluster->refill_lock);

        btrfs_clear_treelog_bg(block_group);
        btrfs_clear_data_reloc_bg(block_group);

        path = btrfs_alloc_path();
        if (!path) {
                ret = -ENOMEM;
                goto out;
        }

        /*
         * get the inode first so any iput calls done for the io_list
         * aren't the final iput (no unlinks allowed now)
         */
        inode = lookup_free_space_inode(block_group, path);

        mutex_lock(&trans->transaction->cache_write_mutex);
        /*
         * Make sure our free space cache IO is done before removing the
         * free space inode
         */
        spin_lock(&trans->transaction->dirty_bgs_lock);
        if (!list_empty(&block_group->io_list)) {
                list_del_init(&block_group->io_list);

                WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);

                spin_unlock(&trans->transaction->dirty_bgs_lock);
                btrfs_wait_cache_io(trans, block_group, path);
                btrfs_put_block_group(block_group);
                spin_lock(&trans->transaction->dirty_bgs_lock);
        }

        if (!list_empty(&block_group->dirty_list)) {
                list_del_init(&block_group->dirty_list);
                remove_rsv = true;
                btrfs_put_block_group(block_group);
        }
        spin_unlock(&trans->transaction->dirty_bgs_lock);
        mutex_unlock(&trans->transaction->cache_write_mutex);

        ret = btrfs_remove_free_space_inode(trans, inode, block_group);
        if (ret)
                goto out;

        spin_lock(&fs_info->block_group_cache_lock);
        rb_erase(&block_group->cache_node,
                 &fs_info->block_group_cache_tree);
        RB_CLEAR_NODE(&block_group->cache_node);

        /* Once for the block groups rbtree */
        btrfs_put_block_group(block_group);

        if (fs_info->first_logical_byte == block_group->start)
                fs_info->first_logical_byte = (u64)-1;
        spin_unlock(&fs_info->block_group_cache_lock);

        down_write(&block_group->space_info->groups_sem);
        /*
         * we must use list_del_init so people can check to see if they
         * are still on the list after taking the semaphore
         */
        list_del_init(&block_group->list);
        if (list_empty(&block_group->space_info->block_groups[index])) {
                kobj = block_group->space_info->block_group_kobjs[index];
                block_group->space_info->block_group_kobjs[index] = NULL;
                clear_avail_alloc_bits(fs_info, block_group->flags);
        }
        up_write(&block_group->space_info->groups_sem);
        clear_incompat_bg_bits(fs_info, block_group->flags);
        if (kobj) {
                kobject_del(kobj);
                kobject_put(kobj);
        }

        if (block_group->has_caching_ctl)
                caching_ctl = btrfs_get_caching_control(block_group);
        if (block_group->cached == BTRFS_CACHE_STARTED)
                btrfs_wait_block_group_cache_done(block_group);
        if (block_group->has_caching_ctl) {
                spin_lock(&fs_info->block_group_cache_lock);
                if (!caching_ctl) {
                        struct btrfs_caching_control *ctl;

                        list_for_each_entry(ctl,
                                    &fs_info->caching_block_groups, list)
                                if (ctl->block_group == block_group) {
                                        caching_ctl = ctl;
                                        refcount_inc(&caching_ctl->count);
                                        break;
                                }
                }
                if (caching_ctl)
                        list_del_init(&caching_ctl->list);
                spin_unlock(&fs_info->block_group_cache_lock);
                if (caching_ctl) {
                        /* Once for the caching bgs list and once for us. */
                        btrfs_put_caching_control(caching_ctl);
                        btrfs_put_caching_control(caching_ctl);
                }
        }

        spin_lock(&trans->transaction->dirty_bgs_lock);
        WARN_ON(!list_empty(&block_group->dirty_list));
        WARN_ON(!list_empty(&block_group->io_list));
        spin_unlock(&trans->transaction->dirty_bgs_lock);

        btrfs_remove_free_space_cache(block_group);

        spin_lock(&block_group->space_info->lock);
        list_del_init(&block_group->ro_list);

        if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
                WARN_ON(block_group->space_info->total_bytes
                        < block_group->length);
                WARN_ON(block_group->space_info->bytes_readonly
                        < block_group->length - block_group->zone_unusable);
                WARN_ON(block_group->space_info->bytes_zone_unusable
                        < block_group->zone_unusable);
                WARN_ON(block_group->space_info->disk_total
                        < block_group->length * factor);
        }
        block_group->space_info->total_bytes -= block_group->length;
        block_group->space_info->bytes_readonly -=
                (block_group->length - block_group->zone_unusable);
        block_group->space_info->bytes_zone_unusable -=
                block_group->zone_unusable;
        block_group->space_info->disk_total -= block_group->length * factor;

        spin_unlock(&block_group->space_info->lock);

        /*
         * Remove the free space for the block group from the free space tree
         * and the block group's item from the extent tree before marking the
         * block group as removed. This is to prevent races with tasks that
         * freeze and unfreeze a block group, this task and another task
         * allocating a new block group - the unfreeze task ends up removing
         * the block group's extent map before the task calling this function
         * deletes the block group item from the extent tree, allowing for
         * another task to attempt to create another block group with the same
         * item key (and failing with -EEXIST and a transaction abort).
         */
        ret = remove_block_group_free_space(trans, block_group);
        if (ret)
                goto out;

        ret = remove_block_group_item(trans, path, block_group);
        if (ret < 0)
                goto out;

        spin_lock(&block_group->lock);
        block_group->removed = 1;
        /*
         * At this point trimming or scrub can't start on this block group,
         * because we removed the block group from the rbtree
         * fs_info->block_group_cache_tree so no one can find it anymore, and
         * even if someone already got this block group before we removed it
         * from the rbtree, they have already incremented block_group->frozen -
         * if they didn't, for the trimming case they won't find any free space
         * entries because we already removed them all when we called
         * btrfs_remove_free_space_cache().
         *
         * And we must not remove the extent map from the fs_info->mapping_tree
         * to prevent the same logical address range and physical device space
         * ranges from being reused for a new block group. This is needed to
         * avoid races with trimming and scrub.
         *
         * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
         * completely transactionless, so while it is trimming a range the
         * currently running transaction might finish and a new one start,
         * allowing for new block groups to be created that can reuse the same
         * physical device locations unless we take this special care.
         *
         * There may also be an implicit trim operation if the file system
         * is mounted with -odiscard. The same protections must remain
         * in place until the extents have been discarded completely when
         * the transaction commit has completed.
         */
        remove_em = (atomic_read(&block_group->frozen) == 0);
        spin_unlock(&block_group->lock);

        if (remove_em) {
                struct extent_map_tree *em_tree;

                em_tree = &fs_info->mapping_tree;
                write_lock(&em_tree->lock);
                remove_extent_mapping(em_tree, em);
                write_unlock(&em_tree->lock);
                /* once for the tree */
                free_extent_map(em);
        }

out:
        /* Once for the lookup reference */
        btrfs_put_block_group(block_group);
        if (remove_rsv)
                btrfs_delayed_refs_rsv_release(fs_info, 1);
        btrfs_free_path(path);
        return ret;
}

struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
                struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
        struct btrfs_root *root = btrfs_block_group_root(fs_info);
        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct map_lookup *map;
        unsigned int num_items;

        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
        read_unlock(&em_tree->lock);
        ASSERT(em && em->start == chunk_offset);

        /*
         * We need to reserve 3 + N units from the metadata space info in order
         * to remove a block group (done at btrfs_remove_chunk() and at
         * btrfs_remove_block_group()), which are used for:
         *
         * 1 unit for adding the free space inode's orphan (located in the tree
         * of tree roots).
         * 1 unit for deleting the block group item (located in the extent
         * tree).
         * 1 unit for deleting the free space item (located in tree of tree
         * roots).
         * N units for deleting N device extent items corresponding to each
         * stripe (located in the device tree).
         *
         * In order to remove a block group we also need to reserve units in the
         * system space info in order to update the chunk tree (update one or
         * more device items and remove one chunk item), but this is done at
         * btrfs_remove_chunk() through a call to check_system_chunk().
         */
        map = em->map_lookup;
        num_items = 3 + map->num_stripes;
        free_extent_map(em);

        return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}

/*
 * Mark block group @cache read-only, so later writes won't happen to block
 * group @cache.
 *
 * If @force is not set, this function will only mark the block group readonly
 * if we have enough free space (1M) in other metadata/system block groups.
 * If @force is set, this function will mark the block group readonly
 * without checking free space.
 *
 * NOTE: This function doesn't care if other block groups can contain all the
 * data in this block group. That check should be done by relocation routine,
 * not this function.
 */
static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
{
        struct btrfs_space_info *sinfo = cache->space_info;
        u64 num_bytes;
        int ret = -ENOSPC;

        spin_lock(&sinfo->lock);
        spin_lock(&cache->lock);

        if (cache->swap_extents) {
                ret = -ETXTBSY;
                goto out;
        }

        if (cache->ro) {
                cache->ro++;
                ret = 0;
                goto out;
        }

        num_bytes = cache->length - cache->reserved - cache->pinned -
                    cache->bytes_super - cache->zone_unusable - cache->used;

        /*
         * Data never overcommits, even in mixed mode, so do just the straight
         * check of left over space in how much we have allocated.
         */
        if (force) {
                ret = 0;
        } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
                u64 sinfo_used = btrfs_space_info_used(sinfo, true);

                /*
                 * Here we make sure if we mark this bg RO, we still have enough
                 * free space as buffer.
                 */
                if (sinfo_used + num_bytes <= sinfo->total_bytes)
                        ret = 0;
        } else {
                /*
                 * We overcommit metadata, so we need to do the
                 * btrfs_can_overcommit check here, and we need to pass in
                 * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
                 * leeway to allow us to mark this block group as read only.
                 */
                if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
                                         BTRFS_RESERVE_NO_FLUSH))
                        ret = 0;
        }

        if (!ret) {
                sinfo->bytes_readonly += num_bytes;
                if (btrfs_is_zoned(cache->fs_info)) {
                        /* Migrate zone_unusable bytes to readonly */
                        sinfo->bytes_readonly += cache->zone_unusable;
                        sinfo->bytes_zone_unusable -= cache->zone_unusable;
                        cache->zone_unusable = 0;
                }
                cache->ro++;
                list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
        }
out:
        spin_unlock(&cache->lock);
        spin_unlock(&sinfo->lock);
        if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
                btrfs_info(cache->fs_info,
                        "unable to make block group %llu ro", cache->start);
                btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
        }
        return ret;
}

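/*
 * Clear the pinned extent ranges of @bg from the current transaction and,
 * if there is one, from the previous (still committing) transaction, so the
 * block group can be deleted without leaving stale pinned ranges behind.
 * Returns true on success.
 */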
static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
                                 struct btrfs_block_group *bg)
{
        struct btrfs_fs_info *fs_info = bg->fs_info;
        struct btrfs_transaction *prev_trans = NULL;
        const u64 start = bg->start;
        const u64 end = start + bg->length - 1;
        int ret;

        spin_lock(&fs_info->trans_lock);
        if (trans->transaction->list.prev != &fs_info->trans_list) {
                prev_trans = list_last_entry(&trans->transaction->list,
                                             struct btrfs_transaction, list);
                refcount_inc(&prev_trans->use_count);
        }
        spin_unlock(&fs_info->trans_lock);

Nikolay Borisov45bb5d62020-01-20 16:09:17 +02001249 /*
1250 * Hold the unused_bg_unpin_mutex lock to avoid racing with
1251 * btrfs_finish_extent_commit(). If we are at transaction N, another
1252 * task might be running finish_extent_commit() for the previous
1253 * transaction N - 1, and have seen a range belonging to the block
Nikolay Borisovfe119a62020-01-20 16:09:18 +02001254 * group in pinned_extents before we were able to clear the whole block
1255 * group range from pinned_extents. This means that task can lookup for
1256 * the block group after we unpinned it from pinned_extents and removed
1257 * it, leading to a BUG_ON() at unpin_extent_range().
Nikolay Borisov45bb5d62020-01-20 16:09:17 +02001258 */
        mutex_lock(&fs_info->unused_bg_unpin_mutex);
        if (prev_trans) {
                ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
                                        EXTENT_DIRTY);
                if (ret)
                        goto out;
        }

        ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
                                EXTENT_DIRTY);
out:
        mutex_unlock(&fs_info->unused_bg_unpin_mutex);
        if (prev_trans)
                btrfs_put_transaction(prev_trans);

        return ret == 0;
}

/*
 * Process the unused_bgs list and remove any that don't have any allocated
 * space inside of them.
 */
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
        struct btrfs_block_group *block_group;
        struct btrfs_space_info *space_info;
        struct btrfs_trans_handle *trans;
        const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
        int ret = 0;

        if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
                return;

        /*
         * Long running balances can keep us blocked here for eternity, so
         * simply skip deletion if we're unable to get the mutex.
         */
        if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
                return;

        spin_lock(&fs_info->unused_bgs_lock);
        while (!list_empty(&fs_info->unused_bgs)) {
                int trimming;

                block_group = list_first_entry(&fs_info->unused_bgs,
                                               struct btrfs_block_group,
                                               bg_list);
                list_del_init(&block_group->bg_list);

                space_info = block_group->space_info;

                if (ret || btrfs_mixed_space_info(space_info)) {
                        btrfs_put_block_group(block_group);
                        continue;
                }
                spin_unlock(&fs_info->unused_bgs_lock);

                btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);

                /* Don't want to race with allocators so take the groups_sem */
                down_write(&space_info->groups_sem);

                /*
                 * Async discard moves the final block group discard to be prior
                 * to the unused_bgs code path. Therefore, if it's not fully
                 * trimmed, punt it back to the async discard lists.
                 */
                if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
                    !btrfs_is_free_space_trimmed(block_group)) {
                        trace_btrfs_skip_unused_block_group(block_group);
                        up_write(&space_info->groups_sem);
                        /* Requeue if we failed because of async discard */
                        btrfs_discard_queue_work(&fs_info->discard_ctl,
                                                 block_group);
                        goto next;
                }

                spin_lock(&block_group->lock);
                if (block_group->reserved || block_group->pinned ||
                    block_group->used || block_group->ro ||
                    list_is_singular(&block_group->list)) {
                        /*
                         * We want to bail if we made new allocations or have
                         * outstanding allocations in this block group. We do
                         * the ro check in case balance is currently acting on
                         * this block group.
                         */
                        trace_btrfs_skip_unused_block_group(block_group);
                        spin_unlock(&block_group->lock);
                        up_write(&space_info->groups_sem);
                        goto next;
                }
                spin_unlock(&block_group->lock);

                /* We don't want to force the issue, only flip if it's ok. */
                ret = inc_block_group_ro(block_group, 0);
                up_write(&space_info->groups_sem);
                if (ret < 0) {
                        ret = 0;
                        goto next;
                }

                /*
                 * Want to do this before we do anything else so we can recover
                 * properly if we fail to join the transaction.
                 */
                trans = btrfs_start_trans_remove_block_group(fs_info,
                                                     block_group->start);
                if (IS_ERR(trans)) {
                        btrfs_dec_block_group_ro(block_group);
                        ret = PTR_ERR(trans);
                        goto next;
                }

                /*
                 * We could have pending pinned extents for this block group,
                 * just delete them, we don't care about them anymore.
                 */
                if (!clean_pinned_extents(trans, block_group)) {
                        btrfs_dec_block_group_ro(block_group);
                        goto end_trans;
                }

                /*
                 * At this point, the block_group is read only and should fail
                 * new allocations. However, btrfs_finish_extent_commit() can
                 * cause this block_group to be placed back on the discard
                 * lists because now the block_group isn't fully discarded.
                 * Bail here and try again later after discarding everything.
                 */
                spin_lock(&fs_info->discard_ctl.lock);
                if (!list_empty(&block_group->discard_list)) {
                        spin_unlock(&fs_info->discard_ctl.lock);
                        btrfs_dec_block_group_ro(block_group);
                        btrfs_discard_queue_work(&fs_info->discard_ctl,
                                                 block_group);
                        goto end_trans;
                }
                spin_unlock(&fs_info->discard_ctl.lock);

                /* Reset pinned so btrfs_put_block_group doesn't complain */
                spin_lock(&space_info->lock);
                spin_lock(&block_group->lock);

                btrfs_space_info_update_bytes_pinned(fs_info, space_info,
                                                     -block_group->pinned);
                space_info->bytes_readonly += block_group->pinned;
                block_group->pinned = 0;

                spin_unlock(&block_group->lock);
                spin_unlock(&space_info->lock);

Dennis Zhou6e80d4f2019-12-13 16:22:15 -08001411 /*
1412 	 * Normally an unused block group is passed in here and its trimming
1413 	 * is then handled in the transaction commit path.  Async discard
1414 	 * interposes before this point to do the trimming first, because
1415 	 * once we go down the unused block group path the trimming will no
1416 	 * longer be done later in the transaction commit path.
1417 */
1418 if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
1419 goto flip_async;
1420
Naohiro Aotadcba6e42021-02-04 19:21:56 +09001421 /*
1422 * DISCARD can flip during remount. On zoned filesystems, we
1423 * need to reset sequential-required zones.
1424 */
1425 trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
1426 btrfs_is_zoned(fs_info);
Josef Bacike3e05202019-06-20 15:37:55 -04001427
1428 /* Implicit trim during transaction commit. */
1429 if (trimming)
Filipe Manana6b7304a2020-05-08 11:01:47 +01001430 btrfs_freeze_block_group(block_group);
Josef Bacike3e05202019-06-20 15:37:55 -04001431
1432 /*
1433 	 * btrfs_remove_chunk() will abort the transaction if things go
1434 * horribly wrong.
1435 */
David Sterbab3470b52019-10-23 18:48:22 +02001436 ret = btrfs_remove_chunk(trans, block_group->start);
Josef Bacike3e05202019-06-20 15:37:55 -04001437
1438 if (ret) {
1439 if (trimming)
Filipe Manana6b7304a2020-05-08 11:01:47 +01001440 btrfs_unfreeze_block_group(block_group);
Josef Bacike3e05202019-06-20 15:37:55 -04001441 goto end_trans;
1442 }
1443
1444 /*
1445 * If we're not mounted with -odiscard, we can just forget
1446 * about this block group. Otherwise we'll need to wait
1447 * until transaction commit to do the actual discard.
1448 */
1449 if (trimming) {
1450 spin_lock(&fs_info->unused_bgs_lock);
1451 /*
1452 * A concurrent scrub might have added us to the list
1453 * fs_info->unused_bgs, so use a list_move operation
1454 * to add the block group to the deleted_bgs list.
1455 */
1456 list_move(&block_group->bg_list,
1457 &trans->transaction->deleted_bgs);
1458 spin_unlock(&fs_info->unused_bgs_lock);
1459 btrfs_get_block_group(block_group);
1460 }
1461end_trans:
1462 btrfs_end_transaction(trans);
1463next:
Josef Bacike3e05202019-06-20 15:37:55 -04001464 btrfs_put_block_group(block_group);
1465 spin_lock(&fs_info->unused_bgs_lock);
1466 }
1467 spin_unlock(&fs_info->unused_bgs_lock);
Johannes Thumshirnf3372062021-04-19 16:41:01 +09001468 mutex_unlock(&fs_info->reclaim_bgs_lock);
Dennis Zhou6e80d4f2019-12-13 16:22:15 -08001469 return;
1470
1471flip_async:
1472 btrfs_end_transaction(trans);
Johannes Thumshirnf3372062021-04-19 16:41:01 +09001473 mutex_unlock(&fs_info->reclaim_bgs_lock);
Dennis Zhou6e80d4f2019-12-13 16:22:15 -08001474 btrfs_put_block_group(block_group);
1475 btrfs_discard_punt_unused_bgs_list(fs_info);
Josef Bacike3e05202019-06-20 15:37:55 -04001476}
1477
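/*
 * Queue a block group on the unused_bgs list so that the cleaner thread can
 * later consider removing it once it is empty.  An extra reference is taken
 * while the block group sits on the list.
 */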
David Sterba32da53862019-10-29 19:20:18 +01001478void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
Josef Bacike3e05202019-06-20 15:37:55 -04001479{
1480 struct btrfs_fs_info *fs_info = bg->fs_info;
1481
1482 spin_lock(&fs_info->unused_bgs_lock);
1483 if (list_empty(&bg->bg_list)) {
1484 btrfs_get_block_group(bg);
1485 trace_btrfs_add_unused_block_group(bg);
1486 list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
1487 }
1488 spin_unlock(&fs_info->unused_bgs_lock);
1489}
Josef Bacik4358d9632019-06-20 15:37:57 -04001490
Johannes Thumshirn2ca0ec72021-10-14 18:39:02 +09001491/*
1492  * We want block groups with a low number of used bytes to be at the beginning
1493 * of the list, so they will get reclaimed first.
1494 */
1495static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
1496 const struct list_head *b)
1497{
1498 const struct btrfs_block_group *bg1, *bg2;
1499
1500 bg1 = list_entry(a, struct btrfs_block_group, bg_list);
1501 bg2 = list_entry(b, struct btrfs_block_group, bg_list);
1502
1503 return bg1->used > bg2->used;
1504}
1505
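/*
 * Worker for the reclaim_bgs list: mark each queued block group read-only
 * and relocate its chunk, so that the data of mostly empty block groups is
 * packed into fewer chunks.  Returns immediately if the filesystem is not
 * fully open or if another exclusive operation (e.g. balance) is running.
 */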
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001506void btrfs_reclaim_bgs_work(struct work_struct *work)
1507{
1508 struct btrfs_fs_info *fs_info =
1509 container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
1510 struct btrfs_block_group *bg;
1511 struct btrfs_space_info *space_info;
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001512
1513 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1514 return;
1515
1516 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
1517 return;
1518
Johannes Thumshirn9cc0b832021-07-06 01:32:38 +09001519 /*
1520 * Long running balances can keep us blocked here for eternity, so
1521 * simply skip reclaim if we're unable to get the mutex.
1522 */
1523 if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
1524 btrfs_exclop_finish(fs_info);
1525 return;
1526 }
1527
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001528 spin_lock(&fs_info->unused_bgs_lock);
Johannes Thumshirn2ca0ec72021-10-14 18:39:02 +09001529 /*
1530 * Sort happens under lock because we can't simply splice it and sort.
1531 * The block groups might still be in use and reachable via bg_list,
1532 * and their presence in the reclaim_bgs list must be preserved.
1533 */
1534 list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001535 while (!list_empty(&fs_info->reclaim_bgs)) {
Johannes Thumshirn5f93e772021-06-29 03:16:46 +09001536 u64 zone_unusable;
Filipe Manana1cea5cf2021-06-21 11:10:38 +01001537 int ret = 0;
1538
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001539 bg = list_first_entry(&fs_info->reclaim_bgs,
1540 struct btrfs_block_group,
1541 bg_list);
1542 list_del_init(&bg->bg_list);
1543
1544 space_info = bg->space_info;
1545 spin_unlock(&fs_info->unused_bgs_lock);
1546
1547 /* Don't race with allocators so take the groups_sem */
1548 down_write(&space_info->groups_sem);
1549
1550 spin_lock(&bg->lock);
1551 if (bg->reserved || bg->pinned || bg->ro) {
1552 /*
1553 * We want to bail if we made new allocations or have
1554 * outstanding allocations in this block group. We do
1555 * the ro check in case balance is currently acting on
1556 * this block group.
1557 */
1558 spin_unlock(&bg->lock);
1559 up_write(&space_info->groups_sem);
1560 goto next;
1561 }
1562 spin_unlock(&bg->lock);
1563
1564 /* Get out fast, in case we're unmounting the filesystem */
1565 if (btrfs_fs_closing(fs_info)) {
1566 up_write(&space_info->groups_sem);
1567 goto next;
1568 }
1569
Johannes Thumshirn5f93e772021-06-29 03:16:46 +09001570 /*
1571 * Cache the zone_unusable value before turning the block group
1572 	 * to read only. As soon as the block group is read only, its
1573 * zone_unusable value gets moved to the block group's read-only
1574 * bytes and isn't available for calculations anymore.
1575 */
1576 zone_unusable = bg->zone_unusable;
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001577 ret = inc_block_group_ro(bg, 0);
1578 up_write(&space_info->groups_sem);
1579 if (ret < 0)
1580 goto next;
1581
Johannes Thumshirn5f93e772021-06-29 03:16:46 +09001582 btrfs_info(fs_info,
1583 "reclaiming chunk %llu with %llu%% used %llu%% unusable",
1584 bg->start, div_u64(bg->used * 100, bg->length),
1585 div64_u64(zone_unusable * 100, bg->length));
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001586 trace_btrfs_reclaim_block_group(bg);
1587 ret = btrfs_relocate_chunk(fs_info, bg->start);
Filipe Mananad96b3422021-11-22 12:03:38 +00001588 if (ret)
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001589 btrfs_err(fs_info, "error relocating chunk %llu",
1590 bg->start);
1591
1592next:
Filipe Mananad96b3422021-11-22 12:03:38 +00001593 btrfs_put_block_group(bg);
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09001594 spin_lock(&fs_info->unused_bgs_lock);
1595 }
1596 spin_unlock(&fs_info->unused_bgs_lock);
1597 mutex_unlock(&fs_info->reclaim_bgs_lock);
1598 btrfs_exclop_finish(fs_info);
1599}
1600
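/* Kick the reclaim worker if any block groups are queued for reclaim. */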
1601void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
1602{
1603 spin_lock(&fs_info->unused_bgs_lock);
1604 if (!list_empty(&fs_info->reclaim_bgs))
1605 queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
1606 spin_unlock(&fs_info->unused_bgs_lock);
1607}
1608
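/*
 * Queue a block group for reclaim by the background worker.  As with the
 * unused_bgs list, an extra reference is held while it is on reclaim_bgs.
 */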
1609void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
1610{
1611 struct btrfs_fs_info *fs_info = bg->fs_info;
1612
1613 spin_lock(&fs_info->unused_bgs_lock);
1614 if (list_empty(&bg->bg_list)) {
1615 btrfs_get_block_group(bg);
1616 trace_btrfs_add_reclaim_block_group(bg);
1617 list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
1618 }
1619 spin_unlock(&fs_info->unused_bgs_lock);
1620}
1621
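/*
 * Validate the block group item the path currently points at against the
 * chunk mapping covering the same logical range: start, length and type
 * flags must all match, otherwise the filesystem is considered corrupted.
 */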
Johannes Thumshirne3ba67a2020-06-02 19:05:57 +09001622static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
1623 struct btrfs_path *path)
1624{
1625 struct extent_map_tree *em_tree;
1626 struct extent_map *em;
1627 struct btrfs_block_group_item bg;
1628 struct extent_buffer *leaf;
1629 int slot;
1630 u64 flags;
1631 int ret = 0;
1632
1633 slot = path->slots[0];
1634 leaf = path->nodes[0];
1635
1636 em_tree = &fs_info->mapping_tree;
1637 read_lock(&em_tree->lock);
1638 em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
1639 read_unlock(&em_tree->lock);
1640 if (!em) {
1641 btrfs_err(fs_info,
1642 "logical %llu len %llu found bg but no related chunk",
1643 key->objectid, key->offset);
1644 return -ENOENT;
1645 }
1646
1647 if (em->start != key->objectid || em->len != key->offset) {
1648 btrfs_err(fs_info,
1649 "block group %llu len %llu mismatch with chunk %llu len %llu",
1650 key->objectid, key->offset, em->start, em->len);
1651 ret = -EUCLEAN;
1652 goto out_free_em;
1653 }
1654
1655 read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
1656 sizeof(bg));
1657 flags = btrfs_stack_block_group_flags(&bg) &
1658 BTRFS_BLOCK_GROUP_TYPE_MASK;
1659
1660 if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1661 btrfs_err(fs_info,
1662"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
1663 key->objectid, key->offset, flags,
1664 (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
1665 ret = -EUCLEAN;
1666 }
1667
1668out_free_em:
1669 free_extent_map(em);
1670 return ret;
1671}
1672
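/*
 * Find the first block group item at or beyond key->objectid and check it
 * against its chunk mapping.  Returns 0 with the path pointing at the item,
 * a positive value if there are no more block group items, or a negative
 * errno on error.
 */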
Josef Bacik4358d9632019-06-20 15:37:57 -04001673static int find_first_block_group(struct btrfs_fs_info *fs_info,
1674 struct btrfs_path *path,
1675 struct btrfs_key *key)
1676{
Josef Bacikdfe8aec2021-11-05 16:45:36 -04001677 struct btrfs_root *root = btrfs_block_group_root(fs_info);
Johannes Thumshirne3ba67a2020-06-02 19:05:57 +09001678 int ret;
Josef Bacik4358d9632019-06-20 15:37:57 -04001679 struct btrfs_key found_key;
1680 struct extent_buffer *leaf;
Josef Bacik4358d9632019-06-20 15:37:57 -04001681 int slot;
1682
1683 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1684 if (ret < 0)
Johannes Thumshirne3ba67a2020-06-02 19:05:57 +09001685 return ret;
Josef Bacik4358d9632019-06-20 15:37:57 -04001686
1687 while (1) {
1688 slot = path->slots[0];
1689 leaf = path->nodes[0];
1690 if (slot >= btrfs_header_nritems(leaf)) {
1691 ret = btrfs_next_leaf(root, path);
1692 if (ret == 0)
1693 continue;
1694 if (ret < 0)
1695 goto out;
1696 break;
1697 }
1698 btrfs_item_key_to_cpu(leaf, &found_key, slot);
1699
1700 if (found_key.objectid >= key->objectid &&
1701 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
Johannes Thumshirne3ba67a2020-06-02 19:05:57 +09001702 ret = read_bg_from_eb(fs_info, &found_key, path);
1703 break;
Josef Bacik4358d9632019-06-20 15:37:57 -04001704 }
Johannes Thumshirne3ba67a2020-06-02 19:05:57 +09001705
Josef Bacik4358d9632019-06-20 15:37:57 -04001706 path->slots[0]++;
1707 }
1708out:
1709 return ret;
1710}
1711
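/*
 * Record the extended profile bits of @flags as available for the matching
 * chunk types (data, metadata, system), for later profile queries.
 */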
1712static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1713{
1714 u64 extra_flags = chunk_to_extended(flags) &
1715 BTRFS_EXTENDED_PROFILE_MASK;
1716
1717 write_seqlock(&fs_info->profiles_lock);
1718 if (flags & BTRFS_BLOCK_GROUP_DATA)
1719 fs_info->avail_data_alloc_bits |= extra_flags;
1720 if (flags & BTRFS_BLOCK_GROUP_METADATA)
1721 fs_info->avail_metadata_alloc_bits |= extra_flags;
1722 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1723 fs_info->avail_system_alloc_bits |= extra_flags;
1724 write_sequnlock(&fs_info->profiles_lock);
1725}
1726
Nikolay Borisov96a14332019-12-10 19:57:51 +02001727/**
Nikolay Borisov9ee9b972021-01-22 11:57:58 +02001728 * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
1729 *
1730 * @fs_info: the filesystem
Nikolay Borisov96a14332019-12-10 19:57:51 +02001731 * @chunk_start: logical address of block group
Naohiro Aota138082f2021-02-04 19:22:02 +09001732 * @bdev: physical device to resolve, can be NULL to indicate any device
Nikolay Borisov96a14332019-12-10 19:57:51 +02001733 * @physical: physical address to map to logical addresses
1734 * @logical: return array of logical addresses which map to @physical
1735 * @naddrs: length of @logical
1736 * @stripe_len: size of IO stripe for the given block group
1737 *
1738 * Maps a particular @physical disk address to a list of @logical addresses.
1739 * Used primarily to exclude those portions of a block group that contain super
1740 * block copies.
1741 */
Nikolay Borisov96a14332019-12-10 19:57:51 +02001742int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
Naohiro Aota138082f2021-02-04 19:22:02 +09001743 struct block_device *bdev, u64 physical, u64 **logical,
1744 int *naddrs, int *stripe_len)
Nikolay Borisov96a14332019-12-10 19:57:51 +02001745{
1746 struct extent_map *em;
1747 struct map_lookup *map;
1748 u64 *buf;
1749 u64 bytenr;
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001750 u64 data_stripe_length;
1751 u64 io_stripe_size;
1752 int i, nr = 0;
1753 int ret = 0;
Nikolay Borisov96a14332019-12-10 19:57:51 +02001754
1755 em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
1756 if (IS_ERR(em))
1757 return -EIO;
1758
1759 map = em->map_lookup;
Nikolay Borisov9e22b922020-04-03 16:40:34 +03001760 data_stripe_length = em->orig_block_len;
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001761 io_stripe_size = map->stripe_len;
Naohiro Aota138082f2021-02-04 19:22:02 +09001762 chunk_start = em->start;
Nikolay Borisov96a14332019-12-10 19:57:51 +02001763
Nikolay Borisov9e22b922020-04-03 16:40:34 +03001764 /* For RAID5/6 adjust to a full IO stripe length */
1765 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001766 io_stripe_size = map->stripe_len * nr_data_stripes(map);
Nikolay Borisov96a14332019-12-10 19:57:51 +02001767
1768 buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001769 if (!buf) {
1770 ret = -ENOMEM;
1771 goto out;
1772 }
Nikolay Borisov96a14332019-12-10 19:57:51 +02001773
1774 for (i = 0; i < map->num_stripes; i++) {
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001775 bool already_inserted = false;
1776 u64 stripe_nr;
Naohiro Aota138082f2021-02-04 19:22:02 +09001777 u64 offset;
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001778 int j;
1779
1780 if (!in_range(physical, map->stripes[i].physical,
1781 data_stripe_length))
Nikolay Borisov96a14332019-12-10 19:57:51 +02001782 continue;
1783
Naohiro Aota138082f2021-02-04 19:22:02 +09001784 if (bdev && map->stripes[i].dev->bdev != bdev)
1785 continue;
1786
Nikolay Borisov96a14332019-12-10 19:57:51 +02001787 stripe_nr = physical - map->stripes[i].physical;
Naohiro Aota138082f2021-02-04 19:22:02 +09001788 stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
Nikolay Borisov96a14332019-12-10 19:57:51 +02001789
1790 if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
1791 stripe_nr = stripe_nr * map->num_stripes + i;
1792 stripe_nr = div_u64(stripe_nr, map->sub_stripes);
1793 } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
1794 stripe_nr = stripe_nr * map->num_stripes + i;
1795 }
1796 /*
1797 * The remaining case would be for RAID56, multiply by
1798 	 * nr_data_stripes(). Alternatively, just use io_stripe_size below
1799 	 * instead of map->stripe_len.
1800 */
1801
Naohiro Aota138082f2021-02-04 19:22:02 +09001802 bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001803
1804 /* Ensure we don't add duplicate addresses */
Nikolay Borisov96a14332019-12-10 19:57:51 +02001805 for (j = 0; j < nr; j++) {
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001806 if (buf[j] == bytenr) {
1807 already_inserted = true;
Nikolay Borisov96a14332019-12-10 19:57:51 +02001808 break;
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001809 }
Nikolay Borisov96a14332019-12-10 19:57:51 +02001810 }
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001811
1812 if (!already_inserted)
Nikolay Borisov96a14332019-12-10 19:57:51 +02001813 buf[nr++] = bytenr;
Nikolay Borisov96a14332019-12-10 19:57:51 +02001814 }
1815
1816 *logical = buf;
1817 *naddrs = nr;
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001818 *stripe_len = io_stripe_size;
1819out:
Nikolay Borisov96a14332019-12-10 19:57:51 +02001820 free_extent_map(em);
Nikolay Borisov1776ad12019-11-19 14:05:53 +02001821 return ret;
Nikolay Borisov96a14332019-12-10 19:57:51 +02001822}
1823
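/*
 * Exclude the ranges of this block group that contain super block copies
 * from free space accounting, and add their size to cache->bytes_super, so
 * they are never handed out by the allocator.
 */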
David Sterba32da53862019-10-29 19:20:18 +01001824static int exclude_super_stripes(struct btrfs_block_group *cache)
Josef Bacik4358d9632019-06-20 15:37:57 -04001825{
1826 struct btrfs_fs_info *fs_info = cache->fs_info;
Naohiro Aota12659252020-11-10 20:26:14 +09001827 const bool zoned = btrfs_is_zoned(fs_info);
Josef Bacik4358d9632019-06-20 15:37:57 -04001828 u64 bytenr;
1829 u64 *logical;
1830 int stripe_len;
1831 int i, nr, ret;
1832
David Sterbab3470b52019-10-23 18:48:22 +02001833 if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
1834 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
Josef Bacik4358d9632019-06-20 15:37:57 -04001835 cache->bytes_super += stripe_len;
David Sterbab3470b52019-10-23 18:48:22 +02001836 ret = btrfs_add_excluded_extent(fs_info, cache->start,
Josef Bacik4358d9632019-06-20 15:37:57 -04001837 stripe_len);
1838 if (ret)
1839 return ret;
1840 }
1841
1842 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1843 bytenr = btrfs_sb_offset(i);
Naohiro Aota138082f2021-02-04 19:22:02 +09001844 ret = btrfs_rmap_block(fs_info, cache->start, NULL,
Josef Bacik4358d9632019-06-20 15:37:57 -04001845 bytenr, &logical, &nr, &stripe_len);
1846 if (ret)
1847 return ret;
1848
Naohiro Aota12659252020-11-10 20:26:14 +09001849 /* Shouldn't have super stripes in sequential zones */
1850 if (zoned && nr) {
1851 btrfs_err(fs_info,
1852 "zoned: block group %llu must not contain super block",
1853 cache->start);
1854 return -EUCLEAN;
1855 }
1856
Josef Bacik4358d9632019-06-20 15:37:57 -04001857 while (nr--) {
Nikolay Borisov96f9b0f2020-04-03 16:40:35 +03001858 u64 len = min_t(u64, stripe_len,
1859 cache->start + cache->length - logical[nr]);
Josef Bacik4358d9632019-06-20 15:37:57 -04001860
1861 cache->bytes_super += len;
Nikolay Borisov96f9b0f2020-04-03 16:40:35 +03001862 ret = btrfs_add_excluded_extent(fs_info, logical[nr],
1863 len);
Josef Bacik4358d9632019-06-20 15:37:57 -04001864 if (ret) {
1865 kfree(logical);
1866 return ret;
1867 }
1868 }
1869
1870 kfree(logical);
1871 }
1872 return 0;
1873}
1874
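/* Add the block group to its space_info's list for the matching RAID index. */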
David Sterba32da53862019-10-29 19:20:18 +01001875static void link_block_group(struct btrfs_block_group *cache)
Josef Bacik4358d9632019-06-20 15:37:57 -04001876{
1877 struct btrfs_space_info *space_info = cache->space_info;
1878 int index = btrfs_bg_flags_to_raid_index(cache->flags);
Josef Bacik4358d9632019-06-20 15:37:57 -04001879
1880 down_write(&space_info->groups_sem);
Josef Bacik4358d9632019-06-20 15:37:57 -04001881 list_add_tail(&cache->list, &space_info->block_groups[index]);
1882 up_write(&space_info->groups_sem);
Josef Bacik4358d9632019-06-20 15:37:57 -04001883}
1884
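/*
 * Allocate and initialize the in-memory representation of a block group
 * starting at @start.  The caller still has to fill in length, flags and
 * usage before making it visible to the rest of the filesystem.
 */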
David Sterba32da53862019-10-29 19:20:18 +01001885static struct btrfs_block_group *btrfs_create_block_group_cache(
Qu Wenruo9afc6642020-05-05 07:58:20 +08001886 struct btrfs_fs_info *fs_info, u64 start)
Josef Bacik4358d9632019-06-20 15:37:57 -04001887{
David Sterba32da53862019-10-29 19:20:18 +01001888 struct btrfs_block_group *cache;
Josef Bacik4358d9632019-06-20 15:37:57 -04001889
1890 cache = kzalloc(sizeof(*cache), GFP_NOFS);
1891 if (!cache)
1892 return NULL;
1893
1894 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
1895 GFP_NOFS);
1896 if (!cache->free_space_ctl) {
1897 kfree(cache);
1898 return NULL;
1899 }
1900
David Sterbab3470b52019-10-23 18:48:22 +02001901 cache->start = start;
Josef Bacik4358d9632019-06-20 15:37:57 -04001902
1903 cache->fs_info = fs_info;
1904 cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
Josef Bacik4358d9632019-06-20 15:37:57 -04001905
Dennis Zhou6e80d4f2019-12-13 16:22:15 -08001906 cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
1907
Josef Bacik48aaeeb2020-07-06 09:14:11 -04001908 refcount_set(&cache->refs, 1);
Josef Bacik4358d9632019-06-20 15:37:57 -04001909 spin_lock_init(&cache->lock);
1910 init_rwsem(&cache->data_rwsem);
1911 INIT_LIST_HEAD(&cache->list);
1912 INIT_LIST_HEAD(&cache->cluster_list);
1913 INIT_LIST_HEAD(&cache->bg_list);
1914 INIT_LIST_HEAD(&cache->ro_list);
Dennis Zhoub0643e52019-12-13 16:22:14 -08001915 INIT_LIST_HEAD(&cache->discard_list);
Josef Bacik4358d9632019-06-20 15:37:57 -04001916 INIT_LIST_HEAD(&cache->dirty_list);
1917 INIT_LIST_HEAD(&cache->io_list);
Naohiro Aotaafba2bc2021-08-19 21:19:17 +09001918 INIT_LIST_HEAD(&cache->active_bg_list);
Josef Bacikcd799092020-10-23 09:58:08 -04001919 btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
Filipe Manana6b7304a2020-05-08 11:01:47 +01001920 atomic_set(&cache->frozen, 0);
Josef Bacik4358d9632019-06-20 15:37:57 -04001921 mutex_init(&cache->free_space_lock);
1922 btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
1923
1924 return cache;
1925}
1926
1927/*
1928 * Iterate all chunks and verify that each of them has the corresponding block
1929 * group
1930 */
1931static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
1932{
1933 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
1934 struct extent_map *em;
David Sterba32da53862019-10-29 19:20:18 +01001935 struct btrfs_block_group *bg;
Josef Bacik4358d9632019-06-20 15:37:57 -04001936 u64 start = 0;
1937 int ret = 0;
1938
1939 while (1) {
1940 read_lock(&map_tree->lock);
1941 /*
1942 * lookup_extent_mapping will return the first extent map
1943 * intersecting the range, so setting @len to 1 is enough to
1944 * get the first chunk.
1945 */
1946 em = lookup_extent_mapping(map_tree, start, 1);
1947 read_unlock(&map_tree->lock);
1948 if (!em)
1949 break;
1950
1951 bg = btrfs_lookup_block_group(fs_info, em->start);
1952 if (!bg) {
1953 btrfs_err(fs_info,
1954 "chunk start=%llu len=%llu doesn't have corresponding block group",
1955 em->start, em->len);
1956 ret = -EUCLEAN;
1957 free_extent_map(em);
1958 break;
1959 }
David Sterbab3470b52019-10-23 18:48:22 +02001960 if (bg->start != em->start || bg->length != em->len ||
Josef Bacik4358d9632019-06-20 15:37:57 -04001961 (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
1962 (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
1963 btrfs_err(fs_info,
1964"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
1965 em->start, em->len,
1966 em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
David Sterbab3470b52019-10-23 18:48:22 +02001967 bg->start, bg->length,
Josef Bacik4358d9632019-06-20 15:37:57 -04001968 bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
1969 ret = -EUCLEAN;
1970 free_extent_map(em);
1971 btrfs_put_block_group(bg);
1972 break;
1973 }
1974 start = em->start + em->len;
1975 free_extent_map(em);
1976 btrfs_put_block_group(bg);
1977 }
1978 return ret;
1979}
1980
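/*
 * Set up the in-memory block group for one on-disk block group item: create
 * the cache structure, load zone info, exclude super block stripes, link it
 * to its space_info and, if it is writeable and empty, queue it for discard
 * or deletion.
 */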
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08001981static int read_one_block_group(struct btrfs_fs_info *info,
Johannes Thumshirn4afd2fe2021-02-04 19:21:44 +09001982 struct btrfs_block_group_item *bgi,
Qu Wenruod49a2dd2019-11-05 09:35:35 +08001983 const struct btrfs_key *key,
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08001984 int need_clear)
1985{
David Sterba32da53862019-10-29 19:20:18 +01001986 struct btrfs_block_group *cache;
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08001987 struct btrfs_space_info *space_info;
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08001988 const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08001989 int ret;
1990
Qu Wenruod49a2dd2019-11-05 09:35:35 +08001991 ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08001992
Qu Wenruo9afc6642020-05-05 07:58:20 +08001993 cache = btrfs_create_block_group_cache(info, key->objectid);
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08001994 if (!cache)
1995 return -ENOMEM;
1996
Johannes Thumshirn4afd2fe2021-02-04 19:21:44 +09001997 cache->length = key->offset;
1998 cache->used = btrfs_stack_block_group_used(bgi);
1999 cache->flags = btrfs_stack_block_group_flags(bgi);
Qu Wenruo9afc6642020-05-05 07:58:20 +08002000
Marcos Paulo de Souzae3e39c72020-08-21 11:54:44 -03002001 set_free_space_tree_thresholds(cache);
2002
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002003 if (need_clear) {
2004 /*
2005 		 * When we mount with an old space cache, we need to
2006 		 * set BTRFS_DC_CLEAR and set the dirty flag.
2007 *
2008 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
2009 * truncate the old free space cache inode and
2010 * setup a new one.
2011 * b) Setting 'dirty flag' makes sure that we flush
2012 * the new space cache info onto disk.
2013 */
2014 if (btrfs_test_opt(info, SPACE_CACHE))
2015 cache->disk_cache_state = BTRFS_DC_CLEAR;
2016 }
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002017 if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2018 (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
2019 btrfs_err(info,
2020"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
2021 cache->start);
2022 ret = -EINVAL;
2023 goto error;
2024 }
2025
Naohiro Aotaa94794d2021-02-04 19:21:51 +09002026 ret = btrfs_load_block_group_zone_info(cache, false);
Naohiro Aota08e11a32021-02-04 19:21:50 +09002027 if (ret) {
2028 btrfs_err(info, "zoned: failed to load zone info of bg %llu",
2029 cache->start);
2030 goto error;
2031 }
2032
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002033 /*
2034 * We need to exclude the super stripes now so that the space info has
2035 * super bytes accounted for, otherwise we'll think we have more space
2036 * than we actually do.
2037 */
2038 ret = exclude_super_stripes(cache);
2039 if (ret) {
2040 /* We may have excluded something, so call this just in case. */
2041 btrfs_free_excluded_extents(cache);
2042 goto error;
2043 }
2044
2045 /*
Naohiro Aota169e0da2021-02-04 19:21:52 +09002046	 * For a zoned filesystem, space after the allocation offset is the only
2047 * free space for a block group. So, we don't need any caching work.
2048 * btrfs_calc_zone_unusable() will set the amount of free space and
2049 * zone_unusable space.
2050 *
2051 	 * For a regular filesystem, check for two cases: either we are full, and
2052 * therefore don't need to bother with the caching work since we won't
2053 * find any space, or we are empty, and we can just add all the space
2054 * in and be done with it. This saves us _a_lot_ of time, particularly
2055 * in the full case.
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002056 */
Naohiro Aota169e0da2021-02-04 19:21:52 +09002057 if (btrfs_is_zoned(info)) {
2058 btrfs_calc_zone_unusable(cache);
Naohiro Aotac46c4242021-08-19 21:19:09 +09002059 /* Should not have any excluded extents. Just in case, though. */
2060 btrfs_free_excluded_extents(cache);
Naohiro Aota169e0da2021-02-04 19:21:52 +09002061 } else if (cache->length == cache->used) {
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002062 cache->last_byte_to_unpin = (u64)-1;
2063 cache->cached = BTRFS_CACHE_FINISHED;
2064 btrfs_free_excluded_extents(cache);
2065 } else if (cache->used == 0) {
2066 cache->last_byte_to_unpin = (u64)-1;
2067 cache->cached = BTRFS_CACHE_FINISHED;
Qu Wenruo9afc6642020-05-05 07:58:20 +08002068 add_new_free_space(cache, cache->start,
2069 cache->start + cache->length);
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002070 btrfs_free_excluded_extents(cache);
2071 }
2072
2073 ret = btrfs_add_block_group_cache(info, cache);
2074 if (ret) {
2075 btrfs_remove_free_space_cache(cache);
2076 goto error;
2077 }
2078 trace_btrfs_add_block_group(info, cache, 0);
Qu Wenruo9afc6642020-05-05 07:58:20 +08002079 btrfs_update_space_info(info, cache->flags, cache->length,
Naohiro Aota169e0da2021-02-04 19:21:52 +09002080 cache->used, cache->bytes_super,
2081 cache->zone_unusable, &space_info);
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002082
2083 cache->space_info = space_info;
2084
2085 link_block_group(cache);
2086
2087 set_avail_alloc_bits(info, cache->flags);
Anand Jaina09f23c2021-08-24 13:27:42 +08002088 if (btrfs_chunk_writeable(info, cache->start)) {
2089 if (cache->used == 0) {
2090 ASSERT(list_empty(&cache->bg_list));
2091 if (btrfs_test_opt(info, DISCARD_ASYNC))
2092 btrfs_discard_queue_work(&info->discard_ctl, cache);
2093 else
2094 btrfs_mark_bg_unused(cache);
2095 }
2096 } else {
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002097 inc_block_group_ro(cache, 1);
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002098 }
Anand Jaina09f23c2021-08-24 13:27:42 +08002099
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002100 return 0;
2101error:
2102 btrfs_put_block_group(cache);
2103 return ret;
2104}
2105
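/*
 * When the block group items cannot be read (no block group root, or a read
 * error with rescue=ibadroots), create a dummy, fully used block group for
 * every chunk mapping so the rest of the code has something to operate on.
 */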
Josef Bacik42437a62020-10-16 11:29:18 -04002106static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
2107{
2108 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
2109 struct btrfs_space_info *space_info;
2110 struct rb_node *node;
2111 int ret = 0;
2112
2113 for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
2114 struct extent_map *em;
2115 struct map_lookup *map;
2116 struct btrfs_block_group *bg;
2117
2118 em = rb_entry(node, struct extent_map, rb_node);
2119 map = em->map_lookup;
2120 bg = btrfs_create_block_group_cache(fs_info, em->start);
2121 if (!bg) {
2122 ret = -ENOMEM;
2123 break;
2124 }
2125
2126 /* Fill dummy cache as FULL */
2127 bg->length = em->len;
2128 bg->flags = map->type;
2129 bg->last_byte_to_unpin = (u64)-1;
2130 bg->cached = BTRFS_CACHE_FINISHED;
2131 bg->used = em->len;
2132 bg->flags = map->type;
2133 ret = btrfs_add_block_group_cache(fs_info, bg);
Qu Wenruo2b297262021-07-19 13:43:04 +08002134 /*
2135 * We may have some valid block group cache added already, in
2136 * that case we skip to the next one.
2137 */
2138 if (ret == -EEXIST) {
2139 ret = 0;
2140 btrfs_put_block_group(bg);
2141 continue;
2142 }
2143
Josef Bacik42437a62020-10-16 11:29:18 -04002144 if (ret) {
2145 btrfs_remove_free_space_cache(bg);
2146 btrfs_put_block_group(bg);
2147 break;
2148 }
Qu Wenruo2b297262021-07-19 13:43:04 +08002149
Josef Bacik42437a62020-10-16 11:29:18 -04002150 btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
Naohiro Aota169e0da2021-02-04 19:21:52 +09002151 0, 0, &space_info);
Josef Bacik42437a62020-10-16 11:29:18 -04002152 bg->space_info = space_info;
2153 link_block_group(bg);
2154
2155 set_avail_alloc_bits(fs_info, bg->flags);
2156 }
2157 if (!ret)
2158 btrfs_init_global_block_rsv(fs_info);
2159 return ret;
2160}
2161
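/*
 * Called at mount time: read all block group items from disk (or create
 * dummy ones if that is impossible), set up their in-memory state and mark
 * un-mirrored block groups read-only when mirrored profiles exist.
 */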
Josef Bacik4358d9632019-06-20 15:37:57 -04002162int btrfs_read_block_groups(struct btrfs_fs_info *info)
2163{
Josef Bacikdfe8aec2021-11-05 16:45:36 -04002164 struct btrfs_root *root = btrfs_block_group_root(info);
Josef Bacik4358d9632019-06-20 15:37:57 -04002165 struct btrfs_path *path;
2166 int ret;
David Sterba32da53862019-10-29 19:20:18 +01002167 struct btrfs_block_group *cache;
Josef Bacik4358d9632019-06-20 15:37:57 -04002168 struct btrfs_space_info *space_info;
2169 struct btrfs_key key;
Josef Bacik4358d9632019-06-20 15:37:57 -04002170 int need_clear = 0;
2171 u64 cache_gen;
Josef Bacik4358d9632019-06-20 15:37:57 -04002172
Josef Bacikdfe8aec2021-11-05 16:45:36 -04002173 if (!root)
Josef Bacik42437a62020-10-16 11:29:18 -04002174 return fill_dummy_bgs(info);
2175
Josef Bacik4358d9632019-06-20 15:37:57 -04002176 key.objectid = 0;
2177 key.offset = 0;
2178 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2179 path = btrfs_alloc_path();
2180 if (!path)
2181 return -ENOMEM;
Josef Bacik4358d9632019-06-20 15:37:57 -04002182
2183 cache_gen = btrfs_super_cache_generation(info->super_copy);
2184 if (btrfs_test_opt(info, SPACE_CACHE) &&
2185 btrfs_super_generation(info->super_copy) != cache_gen)
2186 need_clear = 1;
2187 if (btrfs_test_opt(info, CLEAR_CACHE))
2188 need_clear = 1;
2189
2190 while (1) {
Johannes Thumshirn4afd2fe2021-02-04 19:21:44 +09002191 struct btrfs_block_group_item bgi;
2192 struct extent_buffer *leaf;
2193 int slot;
2194
Josef Bacik4358d9632019-06-20 15:37:57 -04002195 ret = find_first_block_group(info, path, &key);
2196 if (ret > 0)
2197 break;
2198 if (ret != 0)
2199 goto error;
2200
Johannes Thumshirn4afd2fe2021-02-04 19:21:44 +09002201 leaf = path->nodes[0];
2202 slot = path->slots[0];
2203
2204 read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
2205 sizeof(bgi));
2206
2207 btrfs_item_key_to_cpu(leaf, &key, slot);
2208 btrfs_release_path(path);
2209 ret = read_one_block_group(info, &bgi, &key, need_clear);
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002210 if (ret < 0)
Josef Bacik4358d9632019-06-20 15:37:57 -04002211 goto error;
Qu Wenruoffb9e0f2019-10-10 10:39:27 +08002212 key.objectid += key.offset;
2213 key.offset = 0;
Josef Bacik4358d9632019-06-20 15:37:57 -04002214 }
Josef Bacik7837fa82020-10-14 17:00:51 -04002215 btrfs_release_path(path);
Josef Bacik4358d9632019-06-20 15:37:57 -04002216
Josef Bacik72804902020-09-01 17:40:37 -04002217 list_for_each_entry(space_info, &info->space_info, list) {
Josef Bacik49ea1122020-09-01 17:40:38 -04002218 int i;
2219
2220 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2221 if (list_empty(&space_info->block_groups[i]))
2222 continue;
2223 cache = list_first_entry(&space_info->block_groups[i],
2224 struct btrfs_block_group,
2225 list);
2226 btrfs_sysfs_add_block_group_type(cache);
2227 }
2228
Josef Bacik4358d9632019-06-20 15:37:57 -04002229 if (!(btrfs_get_alloc_profile(info, space_info->flags) &
2230 (BTRFS_BLOCK_GROUP_RAID10 |
2231 BTRFS_BLOCK_GROUP_RAID1_MASK |
2232 BTRFS_BLOCK_GROUP_RAID56_MASK |
2233 BTRFS_BLOCK_GROUP_DUP)))
2234 continue;
2235 /*
2236 * Avoid allocating from un-mirrored block group if there are
2237 * mirrored block groups.
2238 */
2239 list_for_each_entry(cache,
2240 &space_info->block_groups[BTRFS_RAID_RAID0],
2241 list)
Josef Bacike11c0402019-06-20 15:38:07 -04002242 inc_block_group_ro(cache, 1);
Josef Bacik4358d9632019-06-20 15:37:57 -04002243 list_for_each_entry(cache,
2244 &space_info->block_groups[BTRFS_RAID_SINGLE],
2245 list)
Josef Bacike11c0402019-06-20 15:38:07 -04002246 inc_block_group_ro(cache, 1);
Josef Bacik4358d9632019-06-20 15:37:57 -04002247 }
2248
2249 btrfs_init_global_block_rsv(info);
2250 ret = check_chunk_block_group_mappings(info);
2251error:
2252 btrfs_free_path(path);
Qu Wenruo2b297262021-07-19 13:43:04 +08002253 /*
2254 	 * We've hit some error while reading the extent tree, and have the
2255 	 * rescue=ibadroots mount option enabled.
2256 * Try to fill the tree using dummy block groups so that the user can
2257 * continue to mount and grab their data.
2258 */
2259 if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
2260 ret = fill_dummy_bgs(info);
Josef Bacik4358d9632019-06-20 15:37:57 -04002261 return ret;
2262}
2263
Filipe Manana79bd3712021-06-29 14:43:06 +01002264/*
2265 * This function, insert_block_group_item(), belongs to the phase 2 of chunk
2266 * allocation.
2267 *
2268 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2269 * phases.
2270 */
Qu Wenruo97f47282020-05-05 07:58:22 +08002271static int insert_block_group_item(struct btrfs_trans_handle *trans,
2272 struct btrfs_block_group *block_group)
2273{
2274 struct btrfs_fs_info *fs_info = trans->fs_info;
2275 struct btrfs_block_group_item bgi;
Josef Bacikdfe8aec2021-11-05 16:45:36 -04002276 struct btrfs_root *root = btrfs_block_group_root(fs_info);
Qu Wenruo97f47282020-05-05 07:58:22 +08002277 struct btrfs_key key;
2278
2279 spin_lock(&block_group->lock);
2280 btrfs_set_stack_block_group_used(&bgi, block_group->used);
2281 btrfs_set_stack_block_group_chunk_objectid(&bgi,
2282 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2283 btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
2284 key.objectid = block_group->start;
2285 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2286 key.offset = block_group->length;
2287 spin_unlock(&block_group->lock);
2288
Qu Wenruo97f47282020-05-05 07:58:22 +08002289 return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
2290}
2291
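/*
 * Insert a DEV_EXTENT item recording that @num_bytes of @device, starting at
 * physical offset @start, are used by the chunk at logical @chunk_offset.
 */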
Nikolay Borisov2eadb9e2021-07-05 12:29:19 +03002292static int insert_dev_extent(struct btrfs_trans_handle *trans,
2293 struct btrfs_device *device, u64 chunk_offset,
2294 u64 start, u64 num_bytes)
2295{
2296 struct btrfs_fs_info *fs_info = device->fs_info;
2297 struct btrfs_root *root = fs_info->dev_root;
2298 struct btrfs_path *path;
2299 struct btrfs_dev_extent *extent;
2300 struct extent_buffer *leaf;
2301 struct btrfs_key key;
2302 int ret;
2303
2304 WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
2305 WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
2306 path = btrfs_alloc_path();
2307 if (!path)
2308 return -ENOMEM;
2309
2310 key.objectid = device->devid;
2311 key.type = BTRFS_DEV_EXTENT_KEY;
2312 key.offset = start;
2313 ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
2314 if (ret)
2315 goto out;
2316
2317 leaf = path->nodes[0];
2318 extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
2319 btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
2320 btrfs_set_dev_extent_chunk_objectid(leaf, extent,
2321 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
2322 btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
2323
2324 btrfs_set_dev_extent_length(leaf, extent, num_bytes);
2325 btrfs_mark_buffer_dirty(leaf);
2326out:
2327 btrfs_free_path(path);
2328 return ret;
2329}
2330
2331/*
2332 * This function belongs to phase 2.
2333 *
2334 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2335 * phases.
2336 */
2337static int insert_dev_extents(struct btrfs_trans_handle *trans,
2338 u64 chunk_offset, u64 chunk_size)
2339{
2340 struct btrfs_fs_info *fs_info = trans->fs_info;
2341 struct btrfs_device *device;
2342 struct extent_map *em;
2343 struct map_lookup *map;
2344 u64 dev_offset;
2345 u64 stripe_size;
2346 int i;
2347 int ret = 0;
2348
2349 em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
2350 if (IS_ERR(em))
2351 return PTR_ERR(em);
2352
2353 map = em->map_lookup;
2354 stripe_size = em->orig_block_len;
2355
2356 /*
2357 * Take the device list mutex to prevent races with the final phase of
2358 * a device replace operation that replaces the device object associated
2359 * with the map's stripes, because the device object's id can change
2360 * at any time during that final phase of the device replace operation
2361 * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
2362 * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
2363 * resulting in persisting a device extent item with such ID.
2364 */
2365 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2366 for (i = 0; i < map->num_stripes; i++) {
2367 device = map->stripes[i].dev;
2368 dev_offset = map->stripes[i].physical;
2369
2370 ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
2371 stripe_size);
2372 if (ret)
2373 break;
2374 }
2375 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2376
2377 free_extent_map(em);
2378 return ret;
2379}
2380
Filipe Manana79bd3712021-06-29 14:43:06 +01002381/*
2382 * This function, btrfs_create_pending_block_groups(), belongs to the phase 2 of
2383 * chunk allocation.
2384 *
2385 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
2386 * phases.
2387 */
Josef Bacik4358d9632019-06-20 15:37:57 -04002388void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
2389{
2390 struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba32da53862019-10-29 19:20:18 +01002391 struct btrfs_block_group *block_group;
Josef Bacik4358d9632019-06-20 15:37:57 -04002392 int ret = 0;
2393
Josef Bacik4358d9632019-06-20 15:37:57 -04002394 while (!list_empty(&trans->new_bgs)) {
Josef Bacik49ea1122020-09-01 17:40:38 -04002395 int index;
2396
Josef Bacik4358d9632019-06-20 15:37:57 -04002397 block_group = list_first_entry(&trans->new_bgs,
David Sterba32da53862019-10-29 19:20:18 +01002398 struct btrfs_block_group,
Josef Bacik4358d9632019-06-20 15:37:57 -04002399 bg_list);
2400 if (ret)
2401 goto next;
2402
Josef Bacik49ea1122020-09-01 17:40:38 -04002403 index = btrfs_bg_flags_to_raid_index(block_group->flags);
2404
Qu Wenruo97f47282020-05-05 07:58:22 +08002405 ret = insert_block_group_item(trans, block_group);
Josef Bacik4358d9632019-06-20 15:37:57 -04002406 if (ret)
2407 btrfs_abort_transaction(trans, ret);
Filipe Manana79bd3712021-06-29 14:43:06 +01002408 if (!block_group->chunk_item_inserted) {
2409 mutex_lock(&fs_info->chunk_mutex);
2410 ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
2411 mutex_unlock(&fs_info->chunk_mutex);
2412 if (ret)
2413 btrfs_abort_transaction(trans, ret);
2414 }
Nikolay Borisov2eadb9e2021-07-05 12:29:19 +03002415 ret = insert_dev_extents(trans, block_group->start,
2416 block_group->length);
Josef Bacik4358d9632019-06-20 15:37:57 -04002417 if (ret)
2418 btrfs_abort_transaction(trans, ret);
2419 add_block_group_free_space(trans, block_group);
Josef Bacik49ea1122020-09-01 17:40:38 -04002420
2421 /*
2422 * If we restriped during balance, we may have added a new raid
2423 * type, so now add the sysfs entries when it is safe to do so.
2424 * We don't have to worry about locking here as it's handled in
2425 * btrfs_sysfs_add_block_group_type.
2426 */
2427 if (block_group->space_info->block_group_kobjs[index] == NULL)
2428 btrfs_sysfs_add_block_group_type(block_group);
2429
Josef Bacik4358d9632019-06-20 15:37:57 -04002430 /* Already aborted the transaction if it failed. */
2431next:
2432 btrfs_delayed_refs_rsv_release(fs_info, 1);
2433 list_del_init(&block_group->bg_list);
2434 }
2435 btrfs_trans_release_chunk_metadata(trans);
2436}
2437
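/*
 * Create the in-memory block group for a newly allocated chunk and queue it
 * on the transaction's new_bgs list; the corresponding items are written to
 * disk later by btrfs_create_pending_block_groups().
 */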
Filipe Manana79bd3712021-06-29 14:43:06 +01002438struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
2439 u64 bytes_used, u64 type,
2440 u64 chunk_offset, u64 size)
Josef Bacik4358d9632019-06-20 15:37:57 -04002441{
2442 struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba32da53862019-10-29 19:20:18 +01002443 struct btrfs_block_group *cache;
Josef Bacik4358d9632019-06-20 15:37:57 -04002444 int ret;
2445
2446 btrfs_set_log_full_commit(trans);
2447
Qu Wenruo9afc6642020-05-05 07:58:20 +08002448 cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
Josef Bacik4358d9632019-06-20 15:37:57 -04002449 if (!cache)
Filipe Manana79bd3712021-06-29 14:43:06 +01002450 return ERR_PTR(-ENOMEM);
Josef Bacik4358d9632019-06-20 15:37:57 -04002451
Qu Wenruo9afc6642020-05-05 07:58:20 +08002452 cache->length = size;
Marcos Paulo de Souzae3e39c72020-08-21 11:54:44 -03002453 set_free_space_tree_thresholds(cache);
David Sterbabf38be62019-10-23 18:48:11 +02002454 cache->used = bytes_used;
Josef Bacik4358d9632019-06-20 15:37:57 -04002455 cache->flags = type;
2456 cache->last_byte_to_unpin = (u64)-1;
2457 cache->cached = BTRFS_CACHE_FINISHED;
Boris Burkov997e3e22020-11-18 15:06:18 -08002458 if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
2459 cache->needs_free_space = 1;
Naohiro Aota08e11a32021-02-04 19:21:50 +09002460
Naohiro Aotaa94794d2021-02-04 19:21:51 +09002461 ret = btrfs_load_block_group_zone_info(cache, true);
Naohiro Aota08e11a32021-02-04 19:21:50 +09002462 if (ret) {
2463 btrfs_put_block_group(cache);
Filipe Manana79bd3712021-06-29 14:43:06 +01002464 return ERR_PTR(ret);
Naohiro Aota08e11a32021-02-04 19:21:50 +09002465 }
2466
Naohiro Aotaeb66a012021-08-19 21:19:20 +09002467 /*
2468 * New block group is likely to be used soon. Try to activate it now.
2469 * Failure is OK for now.
2470 */
2471 btrfs_zone_activate(cache);
2472
Josef Bacik4358d9632019-06-20 15:37:57 -04002473 ret = exclude_super_stripes(cache);
2474 if (ret) {
2475 /* We may have excluded something, so call this just in case */
2476 btrfs_free_excluded_extents(cache);
2477 btrfs_put_block_group(cache);
Filipe Manana79bd3712021-06-29 14:43:06 +01002478 return ERR_PTR(ret);
Josef Bacik4358d9632019-06-20 15:37:57 -04002479 }
2480
2481 add_new_free_space(cache, chunk_offset, chunk_offset + size);
2482
2483 btrfs_free_excluded_extents(cache);
2484
2485#ifdef CONFIG_BTRFS_DEBUG
2486 if (btrfs_should_fragment_free_space(cache)) {
2487 u64 new_bytes_used = size - bytes_used;
2488
2489 bytes_used += new_bytes_used >> 1;
Josef Bacike11c0402019-06-20 15:38:07 -04002490 fragment_free_space(cache);
Josef Bacik4358d9632019-06-20 15:37:57 -04002491 }
2492#endif
2493 /*
2494 * Ensure the corresponding space_info object is created and
2495 * assigned to our block group. We want our bg to be added to the rbtree
2496 * with its ->space_info set.
2497 */
2498 cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
2499 ASSERT(cache->space_info);
2500
2501 ret = btrfs_add_block_group_cache(fs_info, cache);
2502 if (ret) {
2503 btrfs_remove_free_space_cache(cache);
2504 btrfs_put_block_group(cache);
Filipe Manana79bd3712021-06-29 14:43:06 +01002505 return ERR_PTR(ret);
Josef Bacik4358d9632019-06-20 15:37:57 -04002506 }
2507
2508 /*
2509 * Now that our block group has its ->space_info set and is inserted in
2510 * the rbtree, update the space info's counters.
2511 */
2512 trace_btrfs_add_block_group(fs_info, cache, 1);
2513 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
Naohiro Aota98173252021-08-19 21:19:10 +09002514 cache->bytes_super, cache->zone_unusable,
2515 &cache->space_info);
Josef Bacik4358d9632019-06-20 15:37:57 -04002516 btrfs_update_global_block_rsv(fs_info);
2517
2518 link_block_group(cache);
2519
2520 list_add_tail(&cache->bg_list, &trans->new_bgs);
2521 trans->delayed_ref_updates++;
2522 btrfs_update_delayed_refs_rsv(trans);
2523
2524 set_avail_alloc_bits(fs_info, type);
Filipe Manana79bd3712021-06-29 14:43:06 +01002525 return cache;
Josef Bacik4358d9632019-06-20 15:37:57 -04002526}
Josef Bacik26ce2092019-06-20 15:37:59 -04002527
Qu Wenruob12de522019-11-15 10:09:00 +08002528/*
2529 * Mark one block group RO, can be called several times for the same block
2530 * group.
2531 *
2532 * @cache: the destination block group
2533 * @do_chunk_alloc: whether need to do chunk pre-allocation, this is to
2534 * ensure we still have some free space after marking this
2535 * block group RO.
2536 */
2537int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
2538 bool do_chunk_alloc)
Josef Bacik26ce2092019-06-20 15:37:59 -04002539{
2540 struct btrfs_fs_info *fs_info = cache->fs_info;
2541 struct btrfs_trans_handle *trans;
Josef Bacikdfe8aec2021-11-05 16:45:36 -04002542 struct btrfs_root *root = btrfs_block_group_root(fs_info);
Josef Bacik26ce2092019-06-20 15:37:59 -04002543 u64 alloc_flags;
2544 int ret;
Nikolay Borisovb6e9f162021-02-17 15:12:50 +02002545 bool dirty_bg_running;
Josef Bacik26ce2092019-06-20 15:37:59 -04002546
Nikolay Borisovb6e9f162021-02-17 15:12:50 +02002547 do {
Josef Bacikdfe8aec2021-11-05 16:45:36 -04002548 trans = btrfs_join_transaction(root);
Nikolay Borisovb6e9f162021-02-17 15:12:50 +02002549 if (IS_ERR(trans))
2550 return PTR_ERR(trans);
Josef Bacik26ce2092019-06-20 15:37:59 -04002551
Nikolay Borisovb6e9f162021-02-17 15:12:50 +02002552 dirty_bg_running = false;
Josef Bacik26ce2092019-06-20 15:37:59 -04002553
Nikolay Borisovb6e9f162021-02-17 15:12:50 +02002554 /*
2555 * We're not allowed to set block groups readonly after the dirty
2556 * block group cache has started writing. If it already started,
2557 * back off and let this transaction commit.
2558 */
2559 mutex_lock(&fs_info->ro_block_group_mutex);
2560 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
2561 u64 transid = trans->transid;
Josef Bacik26ce2092019-06-20 15:37:59 -04002562
Nikolay Borisovb6e9f162021-02-17 15:12:50 +02002563 mutex_unlock(&fs_info->ro_block_group_mutex);
2564 btrfs_end_transaction(trans);
2565
2566 ret = btrfs_wait_for_commit(fs_info, transid);
2567 if (ret)
2568 return ret;
2569 dirty_bg_running = true;
2570 }
2571 } while (dirty_bg_running);
Josef Bacik26ce2092019-06-20 15:37:59 -04002572
Qu Wenruob12de522019-11-15 10:09:00 +08002573 if (do_chunk_alloc) {
Josef Bacik26ce2092019-06-20 15:37:59 -04002574 /*
Qu Wenruob12de522019-11-15 10:09:00 +08002575 * If we are changing raid levels, try to allocate a
2576 * corresponding block group with the new raid level.
Josef Bacik26ce2092019-06-20 15:37:59 -04002577 */
Josef Bacik349e1202020-07-21 10:48:45 -04002578 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
Qu Wenruob12de522019-11-15 10:09:00 +08002579 if (alloc_flags != cache->flags) {
2580 ret = btrfs_chunk_alloc(trans, alloc_flags,
2581 CHUNK_ALLOC_FORCE);
2582 /*
2583 * ENOSPC is allowed here, we may have enough space
2584 * already allocated at the new raid level to carry on
2585 */
2586 if (ret == -ENOSPC)
2587 ret = 0;
2588 if (ret < 0)
2589 goto out;
2590 }
Josef Bacik26ce2092019-06-20 15:37:59 -04002591 }
2592
Josef Bacika7a63acc2020-01-17 09:07:38 -05002593 ret = inc_block_group_ro(cache, 0);
Filipe Manana195a49e2021-02-05 12:55:37 +00002594 if (!do_chunk_alloc || ret == -ETXTBSY)
Qu Wenruob12de522019-11-15 10:09:00 +08002595 goto unlock_out;
Josef Bacik26ce2092019-06-20 15:37:59 -04002596 if (!ret)
2597 goto out;
2598 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
2599 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
2600 if (ret < 0)
2601 goto out;
Josef Bacike11c0402019-06-20 15:38:07 -04002602 ret = inc_block_group_ro(cache, 0);
Filipe Manana195a49e2021-02-05 12:55:37 +00002603 if (ret == -ETXTBSY)
2604 goto unlock_out;
Josef Bacik26ce2092019-06-20 15:37:59 -04002605out:
2606 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
Josef Bacik349e1202020-07-21 10:48:45 -04002607 alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
Josef Bacik26ce2092019-06-20 15:37:59 -04002608 mutex_lock(&fs_info->chunk_mutex);
2609 check_system_chunk(trans, alloc_flags);
2610 mutex_unlock(&fs_info->chunk_mutex);
2611 }
Qu Wenruob12de522019-11-15 10:09:00 +08002612unlock_out:
Josef Bacik26ce2092019-06-20 15:37:59 -04002613 mutex_unlock(&fs_info->ro_block_group_mutex);
2614
2615 btrfs_end_transaction(trans);
2616 return ret;
2617}
2618
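/*
 * Drop one read-only reference of a block group.  When the last one is
 * dropped, the space accounted as read-only is given back to the space_info
 * (including zone_unusable bytes on zoned filesystems).
 */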
David Sterba32da53862019-10-29 19:20:18 +01002619void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
Josef Bacik26ce2092019-06-20 15:37:59 -04002620{
2621 struct btrfs_space_info *sinfo = cache->space_info;
2622 u64 num_bytes;
2623
2624 BUG_ON(!cache->ro);
2625
2626 spin_lock(&sinfo->lock);
2627 spin_lock(&cache->lock);
2628 if (!--cache->ro) {
Naohiro Aota169e0da2021-02-04 19:21:52 +09002629 if (btrfs_is_zoned(cache->fs_info)) {
2630 /* Migrate zone_unusable bytes back */
Naohiro Aota98173252021-08-19 21:19:10 +09002631 cache->zone_unusable =
2632 (cache->alloc_offset - cache->used) +
2633 (cache->length - cache->zone_capacity);
Naohiro Aota169e0da2021-02-04 19:21:52 +09002634 sinfo->bytes_zone_unusable += cache->zone_unusable;
2635 sinfo->bytes_readonly -= cache->zone_unusable;
2636 }
Naohiro Aotaf9f28e52021-06-17 13:56:18 +09002637 num_bytes = cache->length - cache->reserved -
2638 cache->pinned - cache->bytes_super -
2639 cache->zone_unusable - cache->used;
2640 sinfo->bytes_readonly -= num_bytes;
Josef Bacik26ce2092019-06-20 15:37:59 -04002641 list_del_init(&cache->ro_list);
2642 }
2643 spin_unlock(&cache->lock);
2644 spin_unlock(&sinfo->lock);
2645}
Josef Bacik77745c02019-06-20 15:38:00 -04002646
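/*
 * Write the current used bytes and flags of a block group back into its
 * on-disk block group item.
 */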
Qu Wenruo3be4d8e2020-05-05 07:58:23 +08002647static int update_block_group_item(struct btrfs_trans_handle *trans,
2648 struct btrfs_path *path,
2649 struct btrfs_block_group *cache)
Josef Bacik77745c02019-06-20 15:38:00 -04002650{
2651 struct btrfs_fs_info *fs_info = trans->fs_info;
2652 int ret;
Josef Bacikdfe8aec2021-11-05 16:45:36 -04002653 struct btrfs_root *root = btrfs_block_group_root(fs_info);
Josef Bacik77745c02019-06-20 15:38:00 -04002654 unsigned long bi;
2655 struct extent_buffer *leaf;
David Sterbabf38be62019-10-23 18:48:11 +02002656 struct btrfs_block_group_item bgi;
David Sterbab3470b52019-10-23 18:48:22 +02002657 struct btrfs_key key;
Josef Bacik77745c02019-06-20 15:38:00 -04002658
David Sterbab3470b52019-10-23 18:48:22 +02002659 key.objectid = cache->start;
2660 key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
2661 key.offset = cache->length;
2662
Qu Wenruo3be4d8e2020-05-05 07:58:23 +08002663 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
Josef Bacik77745c02019-06-20 15:38:00 -04002664 if (ret) {
2665 if (ret > 0)
2666 ret = -ENOENT;
2667 goto fail;
2668 }
2669
2670 leaf = path->nodes[0];
2671 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
David Sterbade0dc452019-10-23 18:48:18 +02002672 btrfs_set_stack_block_group_used(&bgi, cache->used);
2673 btrfs_set_stack_block_group_chunk_objectid(&bgi,
David Sterba3d976382019-10-23 18:48:15 +02002674 BTRFS_FIRST_CHUNK_TREE_OBJECTID);
David Sterbade0dc452019-10-23 18:48:18 +02002675 btrfs_set_stack_block_group_flags(&bgi, cache->flags);
David Sterbabf38be62019-10-23 18:48:11 +02002676 write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
Josef Bacik77745c02019-06-20 15:38:00 -04002677 btrfs_mark_buffer_dirty(leaf);
2678fail:
2679 btrfs_release_path(path);
2680 return ret;
2681
2682}
2683
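/*
 * Prepare the v1 free space cache of a block group for this transaction:
 * look up or create the cache inode, truncate stale contents and preallocate
 * space for the new cache, recording in disk_cache_state whether the cache
 * will be written out or skipped.
 */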
David Sterba32da53862019-10-29 19:20:18 +01002684static int cache_save_setup(struct btrfs_block_group *block_group,
Josef Bacik77745c02019-06-20 15:38:00 -04002685 struct btrfs_trans_handle *trans,
2686 struct btrfs_path *path)
2687{
2688 struct btrfs_fs_info *fs_info = block_group->fs_info;
2689 struct btrfs_root *root = fs_info->tree_root;
2690 struct inode *inode = NULL;
2691 struct extent_changeset *data_reserved = NULL;
2692 u64 alloc_hint = 0;
2693 int dcs = BTRFS_DC_ERROR;
Qu Wenruo0044ae12021-04-13 14:23:14 +08002694 u64 cache_size = 0;
Josef Bacik77745c02019-06-20 15:38:00 -04002695 int retries = 0;
2696 int ret = 0;
2697
Boris Burkovaf456a22020-11-18 15:06:26 -08002698 if (!btrfs_test_opt(fs_info, SPACE_CACHE))
2699 return 0;
2700
Josef Bacik77745c02019-06-20 15:38:00 -04002701 /*
2702 * If this block group is smaller than 100 megs don't bother caching the
2703 * block group.
2704 */
David Sterbab3470b52019-10-23 18:48:22 +02002705 if (block_group->length < (100 * SZ_1M)) {
Josef Bacik77745c02019-06-20 15:38:00 -04002706 spin_lock(&block_group->lock);
2707 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2708 spin_unlock(&block_group->lock);
2709 return 0;
2710 }
2711
David Sterbabf31f872020-02-05 17:34:34 +01002712 if (TRANS_ABORTED(trans))
Josef Bacik77745c02019-06-20 15:38:00 -04002713 return 0;
2714again:
2715 inode = lookup_free_space_inode(block_group, path);
2716 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2717 ret = PTR_ERR(inode);
2718 btrfs_release_path(path);
2719 goto out;
2720 }
2721
2722 if (IS_ERR(inode)) {
2723 BUG_ON(retries);
2724 retries++;
2725
2726 if (block_group->ro)
2727 goto out_free;
2728
2729 ret = create_free_space_inode(trans, block_group, path);
2730 if (ret)
2731 goto out_free;
2732 goto again;
2733 }
2734
2735 /*
2736 * We want to set the generation to 0, that way if anything goes wrong
2737 * from here on out we know not to trust this cache when we load up next
2738 * time.
2739 */
2740 BTRFS_I(inode)->generation = 0;
Nikolay Borisov9a56fcd2020-11-02 16:48:59 +02002741 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
Josef Bacik77745c02019-06-20 15:38:00 -04002742 if (ret) {
2743 /*
2744 * So theoretically we could recover from this, simply set the
2745 * super cache generation to 0 so we know to invalidate the
2746 * cache, but then we'd have to keep track of the block groups
2747 * that fail this way so we know we _have_ to reset this cache
2748 * before the next commit or risk reading stale cache. So to
2749 * limit our exposure to horrible edge cases lets just abort the
2750 * transaction, this only happens in really bad situations
2751 * anyway.
2752 */
2753 btrfs_abort_transaction(trans, ret);
2754 goto out_put;
2755 }
2756 WARN_ON(ret);
2757
2758 /* We've already setup this transaction, go ahead and exit */
2759 if (block_group->cache_generation == trans->transid &&
2760 i_size_read(inode)) {
2761 dcs = BTRFS_DC_SETUP;
2762 goto out_put;
2763 }
2764
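	/* Truncate any cache file left from a previous transaction before reuse. */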
2765 if (i_size_read(inode) > 0) {
2766 ret = btrfs_check_trunc_cache_free_space(fs_info,
2767 &fs_info->global_block_rsv);
2768 if (ret)
2769 goto out_put;
2770
2771 ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
2772 if (ret)
2773 goto out_put;
2774 }
2775
2776 spin_lock(&block_group->lock);
2777 if (block_group->cached != BTRFS_CACHE_FINISHED ||
2778 !btrfs_test_opt(fs_info, SPACE_CACHE)) {
2779 /*
2780		 * Don't bother trying to write stuff out _if_
2781		 * a) we're not cached,
2782		 * b) we're mounted with the nospace_cache option,
2783		 * c) we're using the v2 space cache (FREE_SPACE_TREE).
2784 */
2785 dcs = BTRFS_DC_WRITTEN;
2786 spin_unlock(&block_group->lock);
2787 goto out_put;
2788 }
2789 spin_unlock(&block_group->lock);
2790
2791 /*
2792	 * We hit an ENOSPC when setting up the cache in this transaction, so just
2793	 * skip doing the setup; we've already cleared the cache so we're safe.
2794 */
2795 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
2796 ret = -ENOSPC;
2797 goto out_put;
2798 }
2799
2800 /*
2801 * Try to preallocate enough space based on how big the block group is.
2802 * Keep in mind this has to include any pinned space which could end up
2803 * taking up quite a bit since it's not folded into the other space
2804 * cache.
2805 */
Qu Wenruo0044ae12021-04-13 14:23:14 +08002806 cache_size = div_u64(block_group->length, SZ_256M);
2807 if (!cache_size)
2808 cache_size = 1;
Josef Bacik77745c02019-06-20 15:38:00 -04002809
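	/* Preallocate 16 sectors of cache space per 256MiB of block group length. */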
Qu Wenruo0044ae12021-04-13 14:23:14 +08002810 cache_size *= 16;
2811 cache_size *= fs_info->sectorsize;
Josef Bacik77745c02019-06-20 15:38:00 -04002812
Nikolay Borisov36ea6f32020-06-03 08:55:41 +03002813 ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
Qu Wenruo0044ae12021-04-13 14:23:14 +08002814 cache_size);
Josef Bacik77745c02019-06-20 15:38:00 -04002815 if (ret)
2816 goto out_put;
2817
Qu Wenruo0044ae12021-04-13 14:23:14 +08002818 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
2819 cache_size, cache_size,
Josef Bacik77745c02019-06-20 15:38:00 -04002820 &alloc_hint);
2821 /*
2822 * Our cache requires contiguous chunks so that we don't modify a bunch
2823 * of metadata or split extents when writing the cache out, which means
2824	 * we can hit ENOSPC if we are heavily fragmented in addition to just normal
2825 * out of space conditions. So if we hit this just skip setting up any
2826 * other block groups for this transaction, maybe we'll unpin enough
2827 * space the next time around.
2828 */
2829 if (!ret)
2830 dcs = BTRFS_DC_SETUP;
2831 else if (ret == -ENOSPC)
2832 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
2833
2834out_put:
2835 iput(inode);
2836out_free:
2837 btrfs_release_path(path);
2838out:
2839 spin_lock(&block_group->lock);
2840 if (!ret && dcs == BTRFS_DC_SETUP)
2841 block_group->cache_generation = trans->transid;
2842 block_group->disk_cache_state = dcs;
2843 spin_unlock(&block_group->lock);
2844
2845 extent_changeset_free(data_reserved);
2846 return ret;
2847}
2848
2849int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
2850{
2851 struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba32da53862019-10-29 19:20:18 +01002852 struct btrfs_block_group *cache, *tmp;
Josef Bacik77745c02019-06-20 15:38:00 -04002853 struct btrfs_transaction *cur_trans = trans->transaction;
2854 struct btrfs_path *path;
2855
2856 if (list_empty(&cur_trans->dirty_bgs) ||
2857 !btrfs_test_opt(fs_info, SPACE_CACHE))
2858 return 0;
2859
2860 path = btrfs_alloc_path();
2861 if (!path)
2862 return -ENOMEM;
2863
2864 /* Could add new block groups, use _safe just in case */
2865 list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
2866 dirty_list) {
2867 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2868 cache_save_setup(cache, trans, path);
2869 }
2870
2871 btrfs_free_path(path);
2872 return 0;
2873}
2874
2875/*
2876 * Transaction commit does final block group cache writeback during a critical
2877 * section where nothing is allowed to change the FS. This is required in
2878 * order for the cache to actually match the block group, but can introduce a
2879 * lot of latency into the commit.
2880 *
2881 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
2882 * There's a chance we'll have to redo some of it if the block group changes
2883 * again during the commit, but it greatly reduces the commit latency by
2884 * getting rid of the easy block groups while we're still allowing others to
2885 * join the commit.
2886 */
2887int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
2888{
2889 struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba32da53862019-10-29 19:20:18 +01002890 struct btrfs_block_group *cache;
Josef Bacik77745c02019-06-20 15:38:00 -04002891 struct btrfs_transaction *cur_trans = trans->transaction;
2892 int ret = 0;
2893 int should_put;
2894 struct btrfs_path *path = NULL;
2895 LIST_HEAD(dirty);
2896 struct list_head *io = &cur_trans->io_bgs;
2897 int num_started = 0;
2898 int loops = 0;
2899
2900 spin_lock(&cur_trans->dirty_bgs_lock);
2901 if (list_empty(&cur_trans->dirty_bgs)) {
2902 spin_unlock(&cur_trans->dirty_bgs_lock);
2903 return 0;
2904 }
2905 list_splice_init(&cur_trans->dirty_bgs, &dirty);
2906 spin_unlock(&cur_trans->dirty_bgs_lock);
2907
2908again:
2909 /* Make sure all the block groups on our dirty list actually exist */
2910 btrfs_create_pending_block_groups(trans);
2911
2912 if (!path) {
2913 path = btrfs_alloc_path();
Josef Bacik938fcbf2021-01-14 14:02:43 -05002914 if (!path) {
2915 ret = -ENOMEM;
2916 goto out;
2917 }
Josef Bacik77745c02019-06-20 15:38:00 -04002918 }
2919
2920 /*
2921 * cache_write_mutex is here only to save us from balance or automatic
2922 * removal of empty block groups deleting this block group while we are
2923 * writing out the cache
2924 */
2925 mutex_lock(&trans->transaction->cache_write_mutex);
2926 while (!list_empty(&dirty)) {
2927 bool drop_reserve = true;
2928
David Sterba32da53862019-10-29 19:20:18 +01002929 cache = list_first_entry(&dirty, struct btrfs_block_group,
Josef Bacik77745c02019-06-20 15:38:00 -04002930 dirty_list);
2931 /*
2932 * This can happen if something re-dirties a block group that
2933 * is already under IO. Just wait for it to finish and then do
2934 * it all again
2935 */
2936 if (!list_empty(&cache->io_list)) {
2937 list_del_init(&cache->io_list);
2938 btrfs_wait_cache_io(trans, cache, path);
2939 btrfs_put_block_group(cache);
2940 }
2941
2943 /*
2944 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
2945 * it should update the cache_state. Don't delete until after
2946 * we wait.
2947 *
2948 * Since we're not running in the commit critical section
2949 * we need the dirty_bgs_lock to protect from update_block_group
2950 */
2951 spin_lock(&cur_trans->dirty_bgs_lock);
2952 list_del_init(&cache->dirty_list);
2953 spin_unlock(&cur_trans->dirty_bgs_lock);
2954
2955 should_put = 1;
2956
2957 cache_save_setup(cache, trans, path);
2958
2959 if (cache->disk_cache_state == BTRFS_DC_SETUP) {
2960 cache->io_ctl.inode = NULL;
2961 ret = btrfs_write_out_cache(trans, cache, path);
2962 if (ret == 0 && cache->io_ctl.inode) {
2963 num_started++;
2964 should_put = 0;
2965
2966 /*
2967 * The cache_write_mutex is protecting the
2968 * io_list, also refer to the definition of
2969 * btrfs_transaction::io_bgs for more details
2970 */
2971 list_add_tail(&cache->io_list, io);
2972 } else {
2973 /*
2974 * If we failed to write the cache, the
2975 * generation will be bad and life goes on
2976 */
2977 ret = 0;
2978 }
2979 }
2980 if (!ret) {
Qu Wenruo3be4d8e2020-05-05 07:58:23 +08002981 ret = update_block_group_item(trans, path, cache);
Josef Bacik77745c02019-06-20 15:38:00 -04002982 /*
2983 * Our block group might still be attached to the list
2984 * of new block groups in the transaction handle of some
2985 * other task (struct btrfs_trans_handle->new_bgs). This
2986 * means its block group item isn't yet in the extent
2987 * tree. If this happens ignore the error, as we will
2988 * try again later in the critical section of the
2989 * transaction commit.
2990 */
2991 if (ret == -ENOENT) {
2992 ret = 0;
2993 spin_lock(&cur_trans->dirty_bgs_lock);
2994 if (list_empty(&cache->dirty_list)) {
2995 list_add_tail(&cache->dirty_list,
2996 &cur_trans->dirty_bgs);
2997 btrfs_get_block_group(cache);
2998 drop_reserve = false;
2999 }
3000 spin_unlock(&cur_trans->dirty_bgs_lock);
3001 } else if (ret) {
3002 btrfs_abort_transaction(trans, ret);
3003 }
3004 }
3005
3006 /* If it's not on the io list, we need to put the block group */
3007 if (should_put)
3008 btrfs_put_block_group(cache);
3009 if (drop_reserve)
3010 btrfs_delayed_refs_rsv_release(fs_info, 1);
Josef Bacik77745c02019-06-20 15:38:00 -04003011 /*
3012 * Avoid blocking other tasks for too long. It might even save
3013 * us from writing caches for block groups that are going to be
3014 * removed.
3015 */
3016 mutex_unlock(&trans->transaction->cache_write_mutex);
Josef Bacik938fcbf2021-01-14 14:02:43 -05003017 if (ret)
3018 goto out;
Josef Bacik77745c02019-06-20 15:38:00 -04003019 mutex_lock(&trans->transaction->cache_write_mutex);
3020 }
3021 mutex_unlock(&trans->transaction->cache_write_mutex);
3022
3023 /*
3024 * Go through delayed refs for all the stuff we've just kicked off
3025 * and then loop back (just once)
3026 */
Josef Bacik34d1eb02020-12-16 11:22:17 -05003027 if (!ret)
3028 ret = btrfs_run_delayed_refs(trans, 0);
Josef Bacik77745c02019-06-20 15:38:00 -04003029 if (!ret && loops == 0) {
3030 loops++;
3031 spin_lock(&cur_trans->dirty_bgs_lock);
3032 list_splice_init(&cur_trans->dirty_bgs, &dirty);
3033 /*
3034 * dirty_bgs_lock protects us from concurrent block group
3035 * deletes too (not just cache_write_mutex).
3036 */
3037 if (!list_empty(&dirty)) {
3038 spin_unlock(&cur_trans->dirty_bgs_lock);
3039 goto again;
3040 }
3041 spin_unlock(&cur_trans->dirty_bgs_lock);
Josef Bacik938fcbf2021-01-14 14:02:43 -05003042 }
3043out:
3044 if (ret < 0) {
3045 spin_lock(&cur_trans->dirty_bgs_lock);
3046 list_splice_init(&dirty, &cur_trans->dirty_bgs);
3047 spin_unlock(&cur_trans->dirty_bgs_lock);
Josef Bacik77745c02019-06-20 15:38:00 -04003048 btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
3049 }
3050
3051 btrfs_free_path(path);
3052 return ret;
3053}
3054
3055int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
3056{
3057 struct btrfs_fs_info *fs_info = trans->fs_info;
David Sterba32da53862019-10-29 19:20:18 +01003058 struct btrfs_block_group *cache;
Josef Bacik77745c02019-06-20 15:38:00 -04003059 struct btrfs_transaction *cur_trans = trans->transaction;
3060 int ret = 0;
3061 int should_put;
3062 struct btrfs_path *path;
3063 struct list_head *io = &cur_trans->io_bgs;
3064 int num_started = 0;
3065
3066 path = btrfs_alloc_path();
3067 if (!path)
3068 return -ENOMEM;
3069
3070 /*
3071 * Even though we are in the critical section of the transaction commit,
3072 * we can still have concurrent tasks adding elements to this
3073 * transaction's list of dirty block groups. These tasks correspond to
3074 * endio free space workers started when writeback finishes for a
3075 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
3076 * allocate new block groups as a result of COWing nodes of the root
3077 * tree when updating the free space inode. The writeback for the space
3078 * caches is triggered by an earlier call to
3079 * btrfs_start_dirty_block_groups() and iterations of the following
3080 * loop.
3081 * Also we want to do the cache_save_setup first and then run the
3082 * delayed refs to make sure we have the best chance at doing this all
3083 * in one shot.
3084 */
3085 spin_lock(&cur_trans->dirty_bgs_lock);
3086 while (!list_empty(&cur_trans->dirty_bgs)) {
3087 cache = list_first_entry(&cur_trans->dirty_bgs,
David Sterba32da53862019-10-29 19:20:18 +01003088 struct btrfs_block_group,
Josef Bacik77745c02019-06-20 15:38:00 -04003089 dirty_list);
3090
3091 /*
3092 * This can happen if cache_save_setup re-dirties a block group
3093 * that is already under IO. Just wait for it to finish and
3094 * then do it all again
3095 */
3096 if (!list_empty(&cache->io_list)) {
3097 spin_unlock(&cur_trans->dirty_bgs_lock);
3098 list_del_init(&cache->io_list);
3099 btrfs_wait_cache_io(trans, cache, path);
3100 btrfs_put_block_group(cache);
3101 spin_lock(&cur_trans->dirty_bgs_lock);
3102 }
3103
3104 /*
3105 * Don't remove from the dirty list until after we've waited on
3106 * any pending IO
3107 */
3108 list_del_init(&cache->dirty_list);
3109 spin_unlock(&cur_trans->dirty_bgs_lock);
3110 should_put = 1;
3111
3112 cache_save_setup(cache, trans, path);
3113
3114 if (!ret)
3115 ret = btrfs_run_delayed_refs(trans,
3116 (unsigned long) -1);
3117
3118 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
3119 cache->io_ctl.inode = NULL;
3120 ret = btrfs_write_out_cache(trans, cache, path);
3121 if (ret == 0 && cache->io_ctl.inode) {
3122 num_started++;
3123 should_put = 0;
3124 list_add_tail(&cache->io_list, io);
3125 } else {
3126 /*
3127 * If we failed to write the cache, the
3128 * generation will be bad and life goes on
3129 */
3130 ret = 0;
3131 }
3132 }
3133 if (!ret) {
Qu Wenruo3be4d8e2020-05-05 07:58:23 +08003134 ret = update_block_group_item(trans, path, cache);
Josef Bacik77745c02019-06-20 15:38:00 -04003135 /*
3136 * One of the free space endio workers might have
3137 * created a new block group while updating a free space
3138 * cache's inode (at inode.c:btrfs_finish_ordered_io())
3139 * and hasn't released its transaction handle yet, in
3140 * which case the new block group is still attached to
3141 * its transaction handle and its creation has not
3142 * finished yet (no block group item in the extent tree
3143 * yet, etc). If this is the case, wait for all free
3144 * space endio workers to finish and retry. This is a
Randy Dunlap260db432020-08-04 19:48:34 -07003145 * very rare case so no need for a more efficient and
Josef Bacik77745c02019-06-20 15:38:00 -04003146 * complex approach.
3147 */
3148 if (ret == -ENOENT) {
3149 wait_event(cur_trans->writer_wait,
3150 atomic_read(&cur_trans->num_writers) == 1);
Qu Wenruo3be4d8e2020-05-05 07:58:23 +08003151 ret = update_block_group_item(trans, path, cache);
Josef Bacik77745c02019-06-20 15:38:00 -04003152 }
3153 if (ret)
3154 btrfs_abort_transaction(trans, ret);
3155 }
3156
3157		/* If it's not on the io list, we need to put the block group */
3158 if (should_put)
3159 btrfs_put_block_group(cache);
3160 btrfs_delayed_refs_rsv_release(fs_info, 1);
3161 spin_lock(&cur_trans->dirty_bgs_lock);
3162 }
3163 spin_unlock(&cur_trans->dirty_bgs_lock);
3164
3165 /*
3166 * Refer to the definition of io_bgs member for details why it's safe
3167 * to use it without any locking
3168 */
3169 while (!list_empty(io)) {
David Sterba32da53862019-10-29 19:20:18 +01003170 cache = list_first_entry(io, struct btrfs_block_group,
Josef Bacik77745c02019-06-20 15:38:00 -04003171 io_list);
3172 list_del_init(&cache->io_list);
3173 btrfs_wait_cache_io(trans, cache, path);
3174 btrfs_put_block_group(cache);
3175 }
3176
3177 btrfs_free_path(path);
3178 return ret;
3179}
Josef Bacik606d1bf2019-06-20 15:38:02 -04003180
3181int btrfs_update_block_group(struct btrfs_trans_handle *trans,
Anand Jain11b66fa2021-10-13 14:05:14 +08003182 u64 bytenr, u64 num_bytes, bool alloc)
Josef Bacik606d1bf2019-06-20 15:38:02 -04003183{
3184 struct btrfs_fs_info *info = trans->fs_info;
David Sterba32da53862019-10-29 19:20:18 +01003185 struct btrfs_block_group *cache = NULL;
Josef Bacik606d1bf2019-06-20 15:38:02 -04003186 u64 total = num_bytes;
3187 u64 old_val;
3188 u64 byte_in_group;
3189 int factor;
3190 int ret = 0;
3191
3192 /* Block accounting for super block */
3193 spin_lock(&info->delalloc_root_lock);
3194 old_val = btrfs_super_bytes_used(info->super_copy);
3195 if (alloc)
3196 old_val += num_bytes;
3197 else
3198 old_val -= num_bytes;
3199 btrfs_set_super_bytes_used(info->super_copy, old_val);
3200 spin_unlock(&info->delalloc_root_lock);
3201
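	/* Walk every block group that overlaps [bytenr, bytenr + num_bytes). */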
3202 while (total) {
3203 cache = btrfs_lookup_block_group(info, bytenr);
3204 if (!cache) {
3205 ret = -ENOENT;
3206 break;
3207 }
3208 factor = btrfs_bg_type_to_factor(cache->flags);
3209
3210 /*
3211 * If this block group has free space cache written out, we
3212 * need to make sure to load it if we are removing space. This
3213 * is because we need the unpinning stage to actually add the
3214 * space back to the block group, otherwise we will leak space.
3215 */
David Sterba32da53862019-10-29 19:20:18 +01003216 if (!alloc && !btrfs_block_group_done(cache))
Josef Bacik606d1bf2019-06-20 15:38:02 -04003217 btrfs_cache_block_group(cache, 1);
3218
David Sterbab3470b52019-10-23 18:48:22 +02003219 byte_in_group = bytenr - cache->start;
3220 WARN_ON(byte_in_group > cache->length);
Josef Bacik606d1bf2019-06-20 15:38:02 -04003221
3222 spin_lock(&cache->space_info->lock);
3223 spin_lock(&cache->lock);
3224
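		/* Usage is changing, so the on-disk v1 space cache must be rewritten. */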
3225 if (btrfs_test_opt(info, SPACE_CACHE) &&
3226 cache->disk_cache_state < BTRFS_DC_CLEAR)
3227 cache->disk_cache_state = BTRFS_DC_CLEAR;
3228
David Sterbabf38be62019-10-23 18:48:11 +02003229 old_val = cache->used;
David Sterbab3470b52019-10-23 18:48:22 +02003230 num_bytes = min(total, cache->length - byte_in_group);
Josef Bacik606d1bf2019-06-20 15:38:02 -04003231 if (alloc) {
3232 old_val += num_bytes;
David Sterbabf38be62019-10-23 18:48:11 +02003233 cache->used = old_val;
Josef Bacik606d1bf2019-06-20 15:38:02 -04003234 cache->reserved -= num_bytes;
3235 cache->space_info->bytes_reserved -= num_bytes;
3236 cache->space_info->bytes_used += num_bytes;
3237 cache->space_info->disk_used += num_bytes * factor;
3238 spin_unlock(&cache->lock);
3239 spin_unlock(&cache->space_info->lock);
3240 } else {
3241 old_val -= num_bytes;
David Sterbabf38be62019-10-23 18:48:11 +02003242 cache->used = old_val;
Josef Bacik606d1bf2019-06-20 15:38:02 -04003243 cache->pinned += num_bytes;
3244 btrfs_space_info_update_bytes_pinned(info,
3245 cache->space_info, num_bytes);
3246 cache->space_info->bytes_used -= num_bytes;
3247 cache->space_info->disk_used -= num_bytes * factor;
3248 spin_unlock(&cache->lock);
3249 spin_unlock(&cache->space_info->lock);
3250
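			/* Freed space stays pinned until the transaction commit unpins it. */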
Nikolay Borisovfe119a62020-01-20 16:09:18 +02003251 set_extent_dirty(&trans->transaction->pinned_extents,
Josef Bacik606d1bf2019-06-20 15:38:02 -04003252 bytenr, bytenr + num_bytes - 1,
3253 GFP_NOFS | __GFP_NOFAIL);
3254 }
3255
3256 spin_lock(&trans->transaction->dirty_bgs_lock);
3257 if (list_empty(&cache->dirty_list)) {
3258 list_add_tail(&cache->dirty_list,
3259 &trans->transaction->dirty_bgs);
3260 trans->delayed_ref_updates++;
3261 btrfs_get_block_group(cache);
3262 }
3263 spin_unlock(&trans->transaction->dirty_bgs_lock);
3264
3265 /*
3266 * No longer have used bytes in this block group, queue it for
3267 * deletion. We do this after adding the block group to the
3268 * dirty list to avoid races between cleaner kthread and space
3269 * cache writeout.
3270 */
Dennis Zhou6e80d4f2019-12-13 16:22:15 -08003271 if (!alloc && old_val == 0) {
3272 if (!btrfs_test_opt(info, DISCARD_ASYNC))
3273 btrfs_mark_bg_unused(cache);
3274 }
Josef Bacik606d1bf2019-06-20 15:38:02 -04003275
3276 btrfs_put_block_group(cache);
3277 total -= num_bytes;
3278 bytenr += num_bytes;
3279 }
3280
3281 /* Modified block groups are accounted for in the delayed_refs_rsv. */
3282 btrfs_update_delayed_refs_rsv(trans);
3283 return ret;
3284}
3285
3286/**
3287 * btrfs_add_reserved_bytes - update the block_group and space info counters
3288 * @cache: The cache we are manipulating
3289 * @ram_bytes: The number of bytes of file content, which will be the same as
3290 *             @num_bytes except for the compression path.
3291 * @num_bytes: The number of bytes in question
3292 * @delalloc: The blocks are allocated for the delalloc write
3293 *
3294 * This is called by the allocator when it reserves space. If this is a
3295 * reservation and the block group has become read only we cannot make the
3296 * reservation and return -EAGAIN, otherwise this function always succeeds.
3297 */
David Sterba32da53862019-10-29 19:20:18 +01003298int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
Josef Bacik606d1bf2019-06-20 15:38:02 -04003299 u64 ram_bytes, u64 num_bytes, int delalloc)
3300{
3301 struct btrfs_space_info *space_info = cache->space_info;
3302 int ret = 0;
3303
3304 spin_lock(&space_info->lock);
3305 spin_lock(&cache->lock);
3306 if (cache->ro) {
3307 ret = -EAGAIN;
3308 } else {
3309 cache->reserved += num_bytes;
3310 space_info->bytes_reserved += num_bytes;
Josef Bacika43c3832019-08-22 15:10:56 -04003311 trace_btrfs_space_reservation(cache->fs_info, "space_info",
3312 space_info->flags, num_bytes, 1);
Josef Bacik606d1bf2019-06-20 15:38:02 -04003313 btrfs_space_info_update_bytes_may_use(cache->fs_info,
3314 space_info, -ram_bytes);
3315 if (delalloc)
3316 cache->delalloc_bytes += num_bytes;
Josef Bacik99ffb432020-07-21 10:22:19 -04003317
3318 /*
3319 * Compression can use less space than we reserved, so wake
3320 * tickets if that happens
3321 */
3322 if (num_bytes < ram_bytes)
3323 btrfs_try_granting_tickets(cache->fs_info, space_info);
Josef Bacik606d1bf2019-06-20 15:38:02 -04003324 }
3325 spin_unlock(&cache->lock);
3326 spin_unlock(&space_info->lock);
3327 return ret;
3328}
3329
3330/**
3331 * btrfs_free_reserved_bytes - update the block_group and space info counters
3332 * @cache: The cache we are manipulating
3333 * @num_bytes: The number of bytes in question
3334 * @delalloc: The blocks are allocated for the delalloc write
3335 *
3336 * This is called by somebody who is freeing space that was never actually used
3337 * on disk. For example if you reserve some space for a new leaf in transaction
3338 * A and before transaction A commits you free that leaf, you call this with
3339 * reserve set to 0 in order to clear the reservation.
3340 */
David Sterba32da53862019-10-29 19:20:18 +01003341void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
Josef Bacik606d1bf2019-06-20 15:38:02 -04003342 u64 num_bytes, int delalloc)
3343{
3344 struct btrfs_space_info *space_info = cache->space_info;
3345
3346 spin_lock(&space_info->lock);
3347 spin_lock(&cache->lock);
3348 if (cache->ro)
3349 space_info->bytes_readonly += num_bytes;
3350 cache->reserved -= num_bytes;
3351 space_info->bytes_reserved -= num_bytes;
3352 space_info->max_extent_size = 0;
3353
3354 if (delalloc)
3355 cache->delalloc_bytes -= num_bytes;
3356 spin_unlock(&cache->lock);
Josef Bacik33082342020-07-21 10:22:17 -04003357
3358 btrfs_try_granting_tickets(cache->fs_info, space_info);
Josef Bacik606d1bf2019-06-20 15:38:02 -04003359 spin_unlock(&space_info->lock);
3360}
Josef Bacik07730d82019-06-20 15:38:04 -04003361
3362static void force_metadata_allocation(struct btrfs_fs_info *info)
3363{
3364 struct list_head *head = &info->space_info;
3365 struct btrfs_space_info *found;
3366
Josef Bacik72804902020-09-01 17:40:37 -04003367 list_for_each_entry(found, head, list) {
Josef Bacik07730d82019-06-20 15:38:04 -04003368 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3369 found->force_alloc = CHUNK_ALLOC_FORCE;
3370 }
Josef Bacik07730d82019-06-20 15:38:04 -04003371}
3372
3373static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
3374 struct btrfs_space_info *sinfo, int force)
3375{
3376 u64 bytes_used = btrfs_space_info_used(sinfo, false);
3377 u64 thresh;
3378
3379 if (force == CHUNK_ALLOC_FORCE)
3380 return 1;
3381
3382 /*
3383 * in limited mode, we want to have some free space up to
3384 * about 1% of the FS size.
3385 */
3386 if (force == CHUNK_ALLOC_LIMITED) {
3387 thresh = btrfs_super_total_bytes(fs_info->super_copy);
3388 thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
3389
3390 if (sinfo->total_bytes - bytes_used < thresh)
3391 return 1;
3392 }
3393
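	/* Otherwise, only allocate once usage crosses roughly 80% of the space. */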
3394 if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
3395 return 0;
3396 return 1;
3397}
3398
3399int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
3400{
3401 u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);
3402
3403 return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
3404}
3405
Filipe Manana79bd3712021-06-29 14:43:06 +01003406static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
3407{
3408 struct btrfs_block_group *bg;
3409 int ret;
3410
3411 /*
3412 * Check if we have enough space in the system space info because we
3413 * will need to update device items in the chunk btree and insert a new
3414 * chunk item in the chunk btree as well. This will allocate a new
3415 * system block group if needed.
3416 */
3417 check_system_chunk(trans, flags);
3418
Nikolay Borisovf6f39f72021-08-18 13:41:19 +03003419 bg = btrfs_create_chunk(trans, flags);
Filipe Manana79bd3712021-06-29 14:43:06 +01003420 if (IS_ERR(bg)) {
3421 ret = PTR_ERR(bg);
3422 goto out;
3423 }
3424
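	/* Phase 1: update device items and add the chunk item to the chunk btree. */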
Filipe Manana79bd3712021-06-29 14:43:06 +01003425 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3426 /*
3427 * Normally we are not expected to fail with -ENOSPC here, since we have
3428 * previously reserved space in the system space_info and allocated one
Filipe Mananaecd84d52021-10-13 10:12:50 +01003429 * new system chunk if necessary. However there are three exceptions:
Filipe Manana79bd3712021-06-29 14:43:06 +01003430 *
3431 * 1) We may have enough free space in the system space_info but all the
3432 * existing system block groups have a profile which can not be used
3433 * for extent allocation.
3434 *
3435 * This happens when mounting in degraded mode. For example we have a
3436 * RAID1 filesystem with 2 devices, lose one device and mount the fs
3437 * using the other device in degraded mode. If we then allocate a chunk,
3438 * we may have enough free space in the existing system space_info, but
3439 * none of the block groups can be used for extent allocation since they
3440 * have a RAID1 profile, and because we are in degraded mode with a
3441 * single device, we are forced to allocate a new system chunk with a
3442 * SINGLE profile. Making check_system_chunk() iterate over all system
3443 * block groups and check if they have a usable profile and enough space
3444 * can be slow on very large filesystems, so we tolerate the -ENOSPC and
3445 * try again after forcing allocation of a new system chunk. Like this
3446 * we avoid paying the cost of that search in normal circumstances, when
3447 * we were not mounted in degraded mode;
3448 *
3449	 * 2) We had enough free space in the system space_info, and one suitable
3450 * block group to allocate from when we called check_system_chunk()
3451 * above. However right after we called it, the only system block group
3452 * with enough free space got turned into RO mode by a running scrub,
3453 * and in this case we have to allocate a new one and retry. We only
3454	 * need to do this allocation and retry once, since we have a transaction
Filipe Mananaecd84d52021-10-13 10:12:50 +01003455 * handle and scrub uses the commit root to search for block groups;
3456 *
3457 * 3) We had one system block group with enough free space when we called
3458 * check_system_chunk(), but after that, right before we tried to
3459 * allocate the last extent buffer we needed, a discard operation came
3460 * in and it temporarily removed the last free space entry from the
3461 * block group (discard removes a free space entry, discards it, and
3462 * then adds back the entry to the block group cache).
Filipe Manana79bd3712021-06-29 14:43:06 +01003463 */
3464 if (ret == -ENOSPC) {
3465 const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
3466 struct btrfs_block_group *sys_bg;
3467
Nikolay Borisovf6f39f72021-08-18 13:41:19 +03003468 sys_bg = btrfs_create_chunk(trans, sys_flags);
Filipe Manana79bd3712021-06-29 14:43:06 +01003469 if (IS_ERR(sys_bg)) {
3470 ret = PTR_ERR(sys_bg);
3471 btrfs_abort_transaction(trans, ret);
3472 goto out;
3473 }
3474
3475 ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3476 if (ret) {
3477 btrfs_abort_transaction(trans, ret);
3478 goto out;
3479 }
3480
3481 ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
3482 if (ret) {
3483 btrfs_abort_transaction(trans, ret);
3484 goto out;
3485 }
3486 } else if (ret) {
3487 btrfs_abort_transaction(trans, ret);
3488 goto out;
3489 }
3490out:
3491 btrfs_trans_release_chunk_metadata(trans);
3492
3493 return ret;
3494}
3495
Josef Bacik07730d82019-06-20 15:38:04 -04003496/*
Filipe Manana79bd3712021-06-29 14:43:06 +01003497 * Chunk allocation is done in 2 phases:
3498 *
3499 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
3500 * the chunk, the chunk mapping, create its block group and add the items
3501 * that belong in the chunk btree to it - more specifically, we need to
3502 * update device items in the chunk btree and add a new chunk item to it.
3503 *
3504 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
3505 * group item to the extent btree and the device extent items to the devices
3506 * btree.
3507 *
3508 * This is done to prevent deadlocks. For example when COWing a node from the
3509 * extent btree we are holding a write lock on the node's parent and if we
3510 * trigger chunk allocation and attempt to insert the new block group item
3511 * in the extent btree right away, we could deadlock because the path for the
3512 * insertion can include that parent node. At first glance it seems impossible
3513 * to trigger chunk allocation after starting a transaction since tasks should
3514 * reserve enough transaction units (metadata space), however while that is true
3515 * most of the time, chunk allocation may still be triggered for several reasons:
3516 *
3517 * 1) When reserving metadata, we check if there is enough free space in the
3518 * metadata space_info and therefore don't trigger allocation of a new chunk.
3519 * However later when the task actually tries to COW an extent buffer from
3520 * the extent btree or from the device btree for example, it is forced to
3521 * allocate a new block group (chunk) because the only one that had enough
3522 * free space was just turned to RO mode by a running scrub for example (or
3523 * device replace, block group reclaim thread, etc), so we can not use it
3524 * for allocating an extent and end up being forced to allocate a new one;
3525 *
3526 * 2) Because we only check that the metadata space_info has enough free bytes,
3527 * we end up not allocating a new metadata chunk in that case. However if
3528 * the filesystem was mounted in degraded mode, none of the existing block
3529 * groups might be suitable for extent allocation due to their incompatible
3530 *    profile (e.g. mounting a 2-device filesystem, where all block groups
3531 * use a RAID1 profile, in degraded mode using a single device). In this case
3532 * when the task attempts to COW some extent buffer of the extent btree for
3533 * example, it will trigger allocation of a new metadata block group with a
3534 * suitable profile (SINGLE profile in the example of the degraded mount of
3535 * the RAID1 filesystem);
3536 *
3537 * 3) The task has reserved enough transaction units / metadata space, but when
3538 * it attempts to COW an extent buffer from the extent or device btree for
3539 * example, it does not find any free extent in any metadata block group,
3540 * therefore forced to try to allocate a new metadata block group.
3541 * This is because some other task allocated all available extents in the
3542 * meanwhile - this typically happens with tasks that don't reserve space
3543 * properly, either intentionally or as a bug. One example where this is
3544 * done intentionally is fsync, as it does not reserve any transaction units
3545 * and ends up allocating a variable number of metadata extents for log
Filipe Mananaecd84d52021-10-13 10:12:50 +01003546 * tree extent buffers;
3547 *
3548 * 4) The task has reserved enough transaction units / metadata space, but right
3549 * before it tries to allocate the last extent buffer it needs, a discard
3550 * operation comes in and, temporarily, removes the last free space entry from
3551 * the only metadata block group that had free space (discard starts by
3552 * removing a free space entry from a block group, then does the discard
3553 * operation and, once it's done, it adds back the free space entry to the
3554 * block group).
Filipe Manana79bd3712021-06-29 14:43:06 +01003555 *
3556 * We also need this 2 phases setup when adding a device to a filesystem with
3557 * a seed device - we must create new metadata and system chunks without adding
3558 * any of the block group items to the chunk, extent and device btrees. If we
3559 * did not do it this way, we would get ENOSPC when attempting to update those
3560 * btrees, since all the chunks from the seed device are read-only.
3561 *
3562 * Phase 1 does the updates and insertions to the chunk btree because if we had
3563 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
3564 * parallel, we risk having too many system chunks allocated by many tasks if
3565 * many tasks reach phase 1 without the previous ones completing phase 2. In the
3566 * extreme case this leads to exhaustion of the system chunk array in the
3567 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
3568 * and with RAID filesystems (so we have more device items in the chunk btree).
3569 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
3570 * the system chunk array due to concurrent allocations") provides more details.
3571 *
Filipe Manana2bb2e002021-10-13 10:12:49 +01003572 * Allocation of system chunks does not happen through this function. A task that
3573 * needs to update the chunk btree (the only btree that uses system chunks) must
3574 * preallocate chunk space by calling either check_system_chunk() or
3575 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
3576 * metadata chunk or when removing a chunk, while the latter is used before doing
3577 * a modification to the chunk btree - use cases for the latter are adding,
3578 * removing and resizing a device as well as relocation of a system chunk.
3579 * See the comment below for more details.
Filipe Manana79bd3712021-06-29 14:43:06 +01003580 *
3581 * The reservation of system space, done through check_system_chunk(), as well
3582 * as all the updates and insertions into the chunk btree must be done while
3583 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
3584 * an extent buffer from the chunks btree we never trigger allocation of a new
3585 * system chunk, which would result in a deadlock (trying to lock twice an
3586 * extent buffer of the chunk btree, first time before triggering the chunk
3587 * allocation and the second time during chunk allocation while attempting to
3588 * update the chunks btree). The system chunk array is also updated while holding
3589 * that mutex. The same logic applies to removing chunks - we must reserve system
3590 * space, update the chunk btree and the system chunk array in the superblock
3591 * while holding fs_info->chunk_mutex.
3592 *
3593 * This function, btrfs_chunk_alloc(), belongs to phase 1.
3594 *
3595 * If @force is CHUNK_ALLOC_FORCE:
Josef Bacik07730d82019-06-20 15:38:04 -04003596 * - return 1 if it successfully allocates a chunk,
3597 * - return errors including -ENOSPC otherwise.
Filipe Manana79bd3712021-06-29 14:43:06 +01003598 * If @force is NOT CHUNK_ALLOC_FORCE:
Josef Bacik07730d82019-06-20 15:38:04 -04003599 * - return 0 if it doesn't need to allocate a new chunk,
3600 * - return 1 if it successfully allocates a chunk,
3601 * - return errors including -ENOSPC otherwise.
3602 */
3603int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
3604 enum btrfs_chunk_alloc_enum force)
3605{
3606 struct btrfs_fs_info *fs_info = trans->fs_info;
3607 struct btrfs_space_info *space_info;
3608 bool wait_for_alloc = false;
3609 bool should_alloc = false;
3610 int ret = 0;
3611
3612 /* Don't re-enter if we're already allocating a chunk */
3613 if (trans->allocating_chunk)
3614 return -ENOSPC;
Filipe Manana79bd3712021-06-29 14:43:06 +01003615 /*
Filipe Manana2bb2e002021-10-13 10:12:49 +01003616 * Allocation of system chunks can not happen through this path, as we
3617 * could end up in a deadlock if we are allocating a data or metadata
3618 * chunk and there is another task modifying the chunk btree.
3619 *
3620 * This is because while we are holding the chunk mutex, we will attempt
3621 * to add the new chunk item to the chunk btree or update an existing
3622 * device item in the chunk btree, while the other task that is modifying
3623 * the chunk btree is attempting to COW an extent buffer while holding a
3624 * lock on it and on its parent - if the COW operation triggers a system
3625 * chunk allocation, then we can deadlock because we are holding the
3626 * chunk mutex and we may need to access that extent buffer or its parent
3627 * in order to add the chunk item or update a device item.
3628 *
3629 * Tasks that want to modify the chunk tree should reserve system space
3630 * before updating the chunk btree, by calling either
3631 * btrfs_reserve_chunk_metadata() or check_system_chunk().
3632 * It's possible that after a task reserves the space, it still ends up
3633 * here - this happens in the cases described above at do_chunk_alloc().
3634 * The task will have to either retry or fail.
Filipe Manana79bd3712021-06-29 14:43:06 +01003635 */
Filipe Manana2bb2e002021-10-13 10:12:49 +01003636 if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
Filipe Manana79bd3712021-06-29 14:43:06 +01003637 return -ENOSPC;
Josef Bacik07730d82019-06-20 15:38:04 -04003638
3639 space_info = btrfs_find_space_info(fs_info, flags);
3640 ASSERT(space_info);
3641
3642 do {
3643 spin_lock(&space_info->lock);
3644 if (force < space_info->force_alloc)
3645 force = space_info->force_alloc;
3646 should_alloc = should_alloc_chunk(fs_info, space_info, force);
3647 if (space_info->full) {
3648 /* No more free physical space */
3649 if (should_alloc)
3650 ret = -ENOSPC;
3651 else
3652 ret = 0;
3653 spin_unlock(&space_info->lock);
3654 return ret;
3655 } else if (!should_alloc) {
3656 spin_unlock(&space_info->lock);
3657 return 0;
3658 } else if (space_info->chunk_alloc) {
3659 /*
3660 * Someone is already allocating, so we need to block
3661 * until this someone is finished and then loop to
3662 * recheck if we should continue with our allocation
3663 * attempt.
3664 */
3665 wait_for_alloc = true;
3666 spin_unlock(&space_info->lock);
3667 mutex_lock(&fs_info->chunk_mutex);
3668 mutex_unlock(&fs_info->chunk_mutex);
3669 } else {
3670 /* Proceed with allocation */
3671 space_info->chunk_alloc = 1;
3672 wait_for_alloc = false;
3673 spin_unlock(&space_info->lock);
3674 }
3675
3676 cond_resched();
3677 } while (wait_for_alloc);
3678
3679 mutex_lock(&fs_info->chunk_mutex);
3680 trans->allocating_chunk = true;
3681
3682 /*
3683 * If we have mixed data/metadata chunks we want to make sure we keep
3684 * allocating mixed chunks instead of individual chunks.
3685 */
3686 if (btrfs_mixed_space_info(space_info))
3687 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3688
3689 /*
3690 * if we're doing a data chunk, go ahead and make sure that
3691 * we keep a reasonable number of metadata chunks allocated in the
3692 * FS as well.
3693 */
3694 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3695 fs_info->data_chunk_allocations++;
3696 if (!(fs_info->data_chunk_allocations %
3697 fs_info->metadata_ratio))
3698 force_metadata_allocation(fs_info);
3699 }
3700
Filipe Manana79bd3712021-06-29 14:43:06 +01003701 ret = do_chunk_alloc(trans, flags);
Josef Bacik07730d82019-06-20 15:38:04 -04003702 trans->allocating_chunk = false;
3703
3704 spin_lock(&space_info->lock);
3705 if (ret < 0) {
3706 if (ret == -ENOSPC)
3707 space_info->full = 1;
3708 else
3709 goto out;
3710 } else {
3711 ret = 1;
3712 space_info->max_extent_size = 0;
3713 }
3714
3715 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3716out:
3717 space_info->chunk_alloc = 0;
3718 spin_unlock(&space_info->lock);
3719 mutex_unlock(&fs_info->chunk_mutex);
Josef Bacik07730d82019-06-20 15:38:04 -04003720
3721 return ret;
3722}
3723
3724static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
3725{
3726 u64 num_dev;
3727
3728 num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
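	/* A devs_max of 0 means no limit, so fall back to all writable devices. */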
3729 if (!num_dev)
3730 num_dev = fs_info->fs_devices->rw_devices;
3731
3732 return num_dev;
3733}
3734
Filipe Manana2bb2e002021-10-13 10:12:49 +01003735static void reserve_chunk_space(struct btrfs_trans_handle *trans,
3736 u64 bytes,
3737 u64 type)
Josef Bacik07730d82019-06-20 15:38:04 -04003738{
3739 struct btrfs_fs_info *fs_info = trans->fs_info;
3740 struct btrfs_space_info *info;
3741 u64 left;
Josef Bacik07730d82019-06-20 15:38:04 -04003742 int ret = 0;
Josef Bacik07730d82019-06-20 15:38:04 -04003743
3744 /*
3745	 * Needed because we can end up allocating a system chunk, and also for an
3746	 * atomic and race-free space reservation in the chunk block reserve.
3747 */
3748 lockdep_assert_held(&fs_info->chunk_mutex);
3749
3750 info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3751 spin_lock(&info->lock);
3752 left = info->total_bytes - btrfs_space_info_used(info, true);
3753 spin_unlock(&info->lock);
3754
Filipe Manana2bb2e002021-10-13 10:12:49 +01003755 if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
Josef Bacik07730d82019-06-20 15:38:04 -04003756 btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
Filipe Manana2bb2e002021-10-13 10:12:49 +01003757 left, bytes, type);
Josef Bacik07730d82019-06-20 15:38:04 -04003758 btrfs_dump_space_info(fs_info, info, 0, 0);
3759 }
3760
Filipe Manana2bb2e002021-10-13 10:12:49 +01003761 if (left < bytes) {
Josef Bacik07730d82019-06-20 15:38:04 -04003762 u64 flags = btrfs_system_alloc_profile(fs_info);
Filipe Manana79bd3712021-06-29 14:43:06 +01003763 struct btrfs_block_group *bg;
Josef Bacik07730d82019-06-20 15:38:04 -04003764
3765 /*
3766 * Ignore failure to create system chunk. We might end up not
3767		 * needing it, as we might not need to COW all nodes/leaves from
3768 * the paths we visit in the chunk tree (they were already COWed
3769 * or created in the current transaction for example).
3770 */
Nikolay Borisovf6f39f72021-08-18 13:41:19 +03003771 bg = btrfs_create_chunk(trans, flags);
Filipe Manana79bd3712021-06-29 14:43:06 +01003772 if (IS_ERR(bg)) {
3773 ret = PTR_ERR(bg);
Filipe Manana2bb2e002021-10-13 10:12:49 +01003774 } else {
Filipe Manana79bd3712021-06-29 14:43:06 +01003775 /*
3776 * If we fail to add the chunk item here, we end up
3777 * trying again at phase 2 of chunk allocation, at
3778 * btrfs_create_pending_block_groups(). So ignore
Filipe Manana2bb2e002021-10-13 10:12:49 +01003779 * any error here. An ENOSPC here could happen, due to
3780 * the cases described at do_chunk_alloc() - the system
3781 * block group we just created was just turned into RO
3782 * mode by a scrub for example, or a running discard
3783 * temporarily removed its free space entries, etc.
Filipe Manana79bd3712021-06-29 14:43:06 +01003784 */
3785 btrfs_chunk_alloc_add_chunk_item(trans, bg);
3786 }
Josef Bacik07730d82019-06-20 15:38:04 -04003787 }
3788
3789 if (!ret) {
Josef Bacik92705012021-11-09 10:12:07 -05003790 ret = btrfs_block_rsv_add(fs_info,
Josef Bacik07730d82019-06-20 15:38:04 -04003791 &fs_info->chunk_block_rsv,
Filipe Manana2bb2e002021-10-13 10:12:49 +01003792 bytes, BTRFS_RESERVE_NO_FLUSH);
Filipe Manana1cb3db12021-06-29 14:43:05 +01003793 if (!ret)
Filipe Manana2bb2e002021-10-13 10:12:49 +01003794 trans->chunk_bytes_reserved += bytes;
Josef Bacik07730d82019-06-20 15:38:04 -04003795 }
3796}
3797
Filipe Manana2bb2e002021-10-13 10:12:49 +01003798/*
3799 * Reserve space in the system space for allocating or removing a chunk.
3800 * The caller must be holding fs_info->chunk_mutex.
3801 */
3802void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
3803{
3804 struct btrfs_fs_info *fs_info = trans->fs_info;
3805 const u64 num_devs = get_profile_num_devs(fs_info, type);
3806 u64 bytes;
3807
3808 /* num_devs device items to update and 1 chunk item to add or remove. */
3809 bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
3810 btrfs_calc_insert_metadata_size(fs_info, 1);
3811
3812 reserve_chunk_space(trans, bytes, type);
3813}
3814
3815/*
3816 * Reserve space in the system space, if needed, for doing a modification to the
3817 * chunk btree.
3818 *
3819 * @trans: A transaction handle.
3820 * @is_item_insertion: Indicate if the modification is for inserting a new item
3821 * in the chunk btree or if it's for the deletion or update
3822 * of an existing item.
3823 *
3824 * This is used in a context where we need to update the chunk btree outside
3825 * block group allocation and removal, to avoid a deadlock with a concurrent
3826 * task that is allocating a metadata or data block group and therefore needs to
3827 * update the chunk btree while holding the chunk mutex. After the update to the
3828 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
3830 */
3831void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
3832 bool is_item_insertion)
3833{
3834 struct btrfs_fs_info *fs_info = trans->fs_info;
3835 u64 bytes;
3836
3837 if (is_item_insertion)
3838 bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
3839 else
3840 bytes = btrfs_calc_metadata_size(fs_info, 1);
3841
3842 mutex_lock(&fs_info->chunk_mutex);
3843 reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
3844 mutex_unlock(&fs_info->chunk_mutex);
3845}
3846
Josef Bacik3e43c272019-06-20 15:38:06 -04003847void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
3848{
David Sterba32da53862019-10-29 19:20:18 +01003849 struct btrfs_block_group *block_group;
Josef Bacik3e43c272019-06-20 15:38:06 -04003850 u64 last = 0;
3851
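	/* Drop the free space cache inode reference (iref) held by each block group. */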
3852 while (1) {
3853 struct inode *inode;
3854
3855 block_group = btrfs_lookup_first_block_group(info, last);
3856 while (block_group) {
3857 btrfs_wait_block_group_cache_done(block_group);
3858 spin_lock(&block_group->lock);
3859 if (block_group->iref)
3860 break;
3861 spin_unlock(&block_group->lock);
3862 block_group = btrfs_next_block_group(block_group);
3863 }
3864 if (!block_group) {
3865 if (last == 0)
3866 break;
3867 last = 0;
3868 continue;
3869 }
3870
3871 inode = block_group->inode;
3872 block_group->iref = 0;
3873 block_group->inode = NULL;
3874 spin_unlock(&block_group->lock);
3875 ASSERT(block_group->io_ctl.inode == NULL);
3876 iput(inode);
David Sterbab3470b52019-10-23 18:48:22 +02003877 last = block_group->start + block_group->length;
Josef Bacik3e43c272019-06-20 15:38:06 -04003878 btrfs_put_block_group(block_group);
3879 }
3880}
3881
3882/*
3883 * Must be called only after stopping all workers, since we could have block
3884 * group caching kthreads running, and therefore they could race with us if we
3885 * freed the block groups before stopping them.
3886 */
3887int btrfs_free_block_groups(struct btrfs_fs_info *info)
3888{
David Sterba32da53862019-10-29 19:20:18 +01003889 struct btrfs_block_group *block_group;
Josef Bacik3e43c272019-06-20 15:38:06 -04003890 struct btrfs_space_info *space_info;
3891 struct btrfs_caching_control *caching_ctl;
3892 struct rb_node *n;
3893
Josef Bacikbbb86a32020-10-23 09:58:11 -04003894 spin_lock(&info->block_group_cache_lock);
Josef Bacik3e43c272019-06-20 15:38:06 -04003895 while (!list_empty(&info->caching_block_groups)) {
3896 caching_ctl = list_entry(info->caching_block_groups.next,
3897 struct btrfs_caching_control, list);
3898 list_del(&caching_ctl->list);
3899 btrfs_put_caching_control(caching_ctl);
3900 }
Josef Bacikbbb86a32020-10-23 09:58:11 -04003901 spin_unlock(&info->block_group_cache_lock);
Josef Bacik3e43c272019-06-20 15:38:06 -04003902
3903 spin_lock(&info->unused_bgs_lock);
3904 while (!list_empty(&info->unused_bgs)) {
3905 block_group = list_first_entry(&info->unused_bgs,
David Sterba32da53862019-10-29 19:20:18 +01003906 struct btrfs_block_group,
Josef Bacik3e43c272019-06-20 15:38:06 -04003907 bg_list);
3908 list_del_init(&block_group->bg_list);
3909 btrfs_put_block_group(block_group);
3910 }
Josef Bacik3e43c272019-06-20 15:38:06 -04003911
Johannes Thumshirn18bb8bb2021-04-19 16:41:02 +09003912 while (!list_empty(&info->reclaim_bgs)) {
3913 block_group = list_first_entry(&info->reclaim_bgs,
3914 struct btrfs_block_group,
3915 bg_list);
3916 list_del_init(&block_group->bg_list);
3917 btrfs_put_block_group(block_group);
3918 }
3919 spin_unlock(&info->unused_bgs_lock);
3920
Naohiro Aotaafba2bc2021-08-19 21:19:17 +09003921 spin_lock(&info->zone_active_bgs_lock);
3922 while (!list_empty(&info->zone_active_bgs)) {
3923 block_group = list_first_entry(&info->zone_active_bgs,
3924 struct btrfs_block_group,
3925 active_bg_list);
3926 list_del_init(&block_group->active_bg_list);
3927 btrfs_put_block_group(block_group);
3928 }
3929 spin_unlock(&info->zone_active_bgs_lock);
3930
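	/* Tear down the block group cache rbtree, dropping each group's last ref. */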
Josef Bacik3e43c272019-06-20 15:38:06 -04003931 spin_lock(&info->block_group_cache_lock);
3932 while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
David Sterba32da53862019-10-29 19:20:18 +01003933 block_group = rb_entry(n, struct btrfs_block_group,
Josef Bacik3e43c272019-06-20 15:38:06 -04003934 cache_node);
3935 rb_erase(&block_group->cache_node,
3936 &info->block_group_cache_tree);
3937 RB_CLEAR_NODE(&block_group->cache_node);
3938 spin_unlock(&info->block_group_cache_lock);
3939
3940 down_write(&block_group->space_info->groups_sem);
3941 list_del(&block_group->list);
3942 up_write(&block_group->space_info->groups_sem);
3943
3944 /*
3945 * We haven't cached this block group, which means we could
3946 * possibly have excluded extents on this block group.
3947 */
3948 if (block_group->cached == BTRFS_CACHE_NO ||
3949 block_group->cached == BTRFS_CACHE_ERROR)
3950 btrfs_free_excluded_extents(block_group);
3951
3952 btrfs_remove_free_space_cache(block_group);
3953 ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
3954 ASSERT(list_empty(&block_group->dirty_list));
3955 ASSERT(list_empty(&block_group->io_list));
3956 ASSERT(list_empty(&block_group->bg_list));
Josef Bacik48aaeeb2020-07-06 09:14:11 -04003957 ASSERT(refcount_read(&block_group->refs) == 1);
Filipe Manana195a49e2021-02-05 12:55:37 +00003958 ASSERT(block_group->swap_extents == 0);
Josef Bacik3e43c272019-06-20 15:38:06 -04003959 btrfs_put_block_group(block_group);
3960
3961 spin_lock(&info->block_group_cache_lock);
3962 }
3963 spin_unlock(&info->block_group_cache_lock);
3964
Josef Bacik3e43c272019-06-20 15:38:06 -04003965 btrfs_release_global_block_rsv(info);
3966
3967 while (!list_empty(&info->space_info)) {
3968 space_info = list_entry(info->space_info.next,
3969 struct btrfs_space_info,
3970 list);
3971
3972 /*
3973 * Do not hide this behind enospc_debug, this is actually
3974 * important and indicates a real bug if this happens.
3975 */
3976 if (WARN_ON(space_info->bytes_pinned > 0 ||
3977 space_info->bytes_reserved > 0 ||
3978 space_info->bytes_may_use > 0))
3979 btrfs_dump_space_info(info, space_info, 0, 0);
Filipe Mananad611add2020-04-07 11:38:49 +01003980 WARN_ON(space_info->reclaim_size > 0);
Josef Bacik3e43c272019-06-20 15:38:06 -04003981 list_del(&space_info->list);
3982 btrfs_sysfs_remove_space_info(space_info);
3983 }
3984 return 0;
3985}
Filipe Manana684b7522020-05-08 11:01:59 +01003986
3987void btrfs_freeze_block_group(struct btrfs_block_group *cache)
3988{
3989 atomic_inc(&cache->frozen);
3990}
3991
3992void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
3993{
3994 struct btrfs_fs_info *fs_info = block_group->fs_info;
3995 struct extent_map_tree *em_tree;
3996 struct extent_map *em;
3997 bool cleanup;
3998
3999 spin_lock(&block_group->lock);
4000 cleanup = (atomic_dec_and_test(&block_group->frozen) &&
4001 block_group->removed);
4002 spin_unlock(&block_group->lock);
4003
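	/*
	 * The last task to unfreeze a removed block group finishes the cleanup:
	 * drop its chunk mapping and its free space cache entries.
	 */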
4004 if (cleanup) {
Filipe Manana684b7522020-05-08 11:01:59 +01004005 em_tree = &fs_info->mapping_tree;
4006 write_lock(&em_tree->lock);
4007 em = lookup_extent_mapping(em_tree, block_group->start,
4008 1);
4009 BUG_ON(!em); /* logic error, can't happen */
4010 remove_extent_mapping(em_tree, em);
4011 write_unlock(&em_tree->lock);
Filipe Manana684b7522020-05-08 11:01:59 +01004012
4013 /* once for us and once for the tree */
4014 free_extent_map(em);
4015 free_extent_map(em);
4016
4017 /*
4018		 * We may have left one free space entry, and other tasks trimming
4019		 * this block group may have left one entry each.
4020		 * Free them if any.
4021 */
4022 __btrfs_remove_free_space_cache(block_group->free_space_ctl);
4023 }
4024}
Filipe Manana195a49e2021-02-05 12:55:37 +00004025
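/*
 * Account one more active swap file extent in this block group. Refuse if the
 * block group is read-only, as it may be about to be relocated or removed.
 */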
4026bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
4027{
4028 bool ret = true;
4029
4030 spin_lock(&bg->lock);
4031 if (bg->ro)
4032 ret = false;
4033 else
4034 bg->swap_extents++;
4035 spin_unlock(&bg->lock);
4036
4037 return ret;
4038}
4039
4040void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
4041{
4042 spin_lock(&bg->lock);
4043 ASSERT(!bg->ro);
4044 ASSERT(bg->swap_extents >= amount);
4045 bg->swap_extents -= amount;
4046 spin_unlock(&bg->lock);
4047}