Blame - fs/btrfs/raid56.c - SHIFTPHONES/mainline/linux

blob: 244d499ebc72ca112ced4571e5ae5dc6a032169b [file] [log] [blame]

David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2	/*
				3	* Copyright (C) 2012 Fusion-io All rights reserved.
				4	* Copyright (C) 2012 Intel Corp. All rights reserved.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	5	*/
David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	6
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	7	#include <linux/sched.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	8	#include <linux/bio.h>
				9	#include <linux/slab.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	10	#include <linux/blkdev.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	11	#include <linux/raid/pq.h>
				12	#include <linux/hash.h>
				13	#include <linux/list_sort.h>
				14	#include <linux/raid/xor.h>
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	15	#include <linux/mm.h>
Johannes Thumshirn	cea6280	2021-03-16 19:04:01 +0900	[diff] [blame]	16	#include "misc.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	17	#include "ctree.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	18	#include "disk-io.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	19	#include "volumes.h"
				20	#include "raid56.h"
				21	#include "async-thread.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	22
				23	/* set when additional merges to this rbio are not allowed */
				24	#define RBIO_RMW_LOCKED_BIT 1
				25
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	26	/*
				27	* set when this rbio is sitting in the hash, but it is just a cache
				28	* of past RMW
				29	*/
				30	#define RBIO_CACHE_BIT 2
				31
				32	/*
				33	* set when it is safe to trust the stripe_pages for caching
				34	*/
				35	#define RBIO_CACHE_READY_BIT 3
				36
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	37	#define RBIO_CACHE_SIZE 1024
				38
David Sterba	8a95334	2019-08-21 19:06:17 +0200	[diff] [blame]	39	#define BTRFS_STRIPE_HASH_TABLE_BITS 11
				40
				41	/* Used by the raid56 code to lock stripes for read/modify/write */
				42	struct btrfs_stripe_hash {
				43	struct list_head hash_list;
				44	spinlock_t lock;
				45	};
				46
				47	/* Used by the raid56 code to lock stripes for read/modify/write */
				48	struct btrfs_stripe_hash_table {
				49	struct list_head stripe_cache;
				50	spinlock_t cache_lock;
				51	int cache_size;
				52	struct btrfs_stripe_hash table[];
				53	};
				54
/*
 * Kind of operation an rbio carries out. rbio_can_merge() refuses to
 * merge two rbios whose operations differ.
 */
enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE,
	BTRFS_RBIO_READ_REBUILD,
	BTRFS_RBIO_PARITY_SCRUB,
	BTRFS_RBIO_REBUILD_MISSING,
};
				61
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	62	struct btrfs_raid_bio {
				63	struct btrfs_fs_info *fs_info;
				64	struct btrfs_bio *bbio;
				65
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	66	/* while we're doing rmw on a stripe
				67	* we put it into a hash table so we can
				68	* lock the stripe and merge more rbios
				69	* into it.
				70	*/
				71	struct list_head hash_list;
				72
				73	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	74	* LRU list for the stripe cache
				75	*/
				76	struct list_head stripe_cache;
				77
				78	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	79	* for scheduling work in the helper threads
				80	*/
				81	struct btrfs_work work;
				82
				83	/*
				84	* bio list and bio_list_lock are used
				85	* to add more bios into the stripe
				86	* in hopes of avoiding the full rmw
				87	*/
				88	struct bio_list bio_list;
				89	spinlock_t bio_list_lock;
				90
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	91	/* also protected by the bio_list_lock, the
				92	* plug list is used by the plugging code
				93	* to collect partial bios while plugged. The
				94	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	95	* the stripe lock to the next pending IO
				96	*/
				97	struct list_head plug_list;
				98
				99	/*
				100	* flags that tell us if it is safe to
				101	* merge with this bio
				102	*/
				103	unsigned long flags;
				104
				105	/* size of each individual stripe on disk */
				106	int stripe_len;
				107
				108	/* number of data stripes (no p/q) */
				109	int nr_data;
				110
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	111	int real_stripes;
				112
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	113	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	114	/*
				115	* set if we're doing a parity rebuild
				116	* for a read from higher up, which is handled
				117	* differently from a parity rebuild as part of
				118	* rmw
				119	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	120	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	121
				122	/* first bad stripe */
				123	int faila;
				124
				125	/* second bad stripe (for raid6 use) */
				126	int failb;
				127
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	128	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	129	/*
				130	* number of pages needed to represent the full
				131	* stripe
				132	*/
				133	int nr_pages;
				134
				135	/*
				136	* size of all the bios in the bio_list. This
				137	* helps us decide if the rbio maps to a full
				138	* stripe or not
				139	*/
				140	int bio_list_bytes;
				141
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	142	int generic_bio_cnt;
				143
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	144	refcount_t refs;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	145
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	146	atomic_t stripes_pending;
				147
				148	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	149	/*
				150	* these are two arrays of pointers. We allocate the
				151	* rbio big enough to hold them both and setup their
				152	* locations when the rbio is allocated
				153	*/
				154
				155	/* pointers to pages that we allocated for
				156	* reading/writing stripes directly from the disk (including P/Q)
				157	*/
				158	struct page **stripe_pages;
				159
				160	/*
				161	* pointers to the pages in the bio_list. Stored
				162	* here for faster lookup
				163	*/
				164	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	165
				166	/*
				167	* bitmap to record which horizontal stripe has data
				168	*/
				169	unsigned long *dbitmap;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	170
				171	/* allocated with real_stripes-many pointers for finish_() calls /
				172	void **finish_pointers;
				173
				174	/* allocated with stripe_npages-many bits for finish_() calls /
				175	unsigned long *finish_pbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	176	};
				177
				178	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				179	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				180	static void rmw_work(struct btrfs_work *work);
				181	static void read_rebuild_work(struct btrfs_work *work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	182	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				183	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				184	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				185	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				186	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				187
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	188	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				189	int need_check);
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	190	static void scrub_parity_work(struct btrfs_work *work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	191
David Sterba	ac63885	2018-06-29 10:56:56 +0200	[diff] [blame]	192	static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
				193	{
Omar Sandoval	a0cac0e	2019-09-16 11:30:57 -0700	[diff] [blame]	194	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
David Sterba	ac63885	2018-06-29 10:56:56 +0200	[diff] [blame]	195	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
				196	}
				197
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	198	/*
				199	* the stripe hash table is used for locking, and to collect
				200	* bios in hopes of making a full stripe
				201	*/
				202	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				203	{
				204	struct btrfs_stripe_hash_table *table;
				205	struct btrfs_stripe_hash_table *x;
				206	struct btrfs_stripe_hash *cur;
				207	struct btrfs_stripe_hash *h;
				208	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				209	int i;
				210
				211	if (info->stripe_hash_table)
				212	return 0;
				213
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	214	/*
				215	* The table is large, starting with order 4 and can go as high as
				216	* order 7 in case lock debugging is turned on.
				217	*
				218	* Try harder to allocate and fallback to vmalloc to lower the chance
				219	* of a failing mount.
				220	*/
David Sterba	ee787f9	2019-03-29 02:07:02 +0100	[diff] [blame]	221	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	222	if (!table)
				223	return -ENOMEM;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	224
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	225	spin_lock_init(&table->cache_lock);
				226	INIT_LIST_HEAD(&table->stripe_cache);
				227
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	228	h = table->table;
				229
				230	for (i = 0; i < num_entries; i++) {
				231	cur = h + i;
				232	INIT_LIST_HEAD(&cur->hash_list);
				233	spin_lock_init(&cur->lock);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	234	}
				235
				236	x = cmpxchg(&info->stripe_hash_table, NULL, table);
Yang Li	fe3b7bb	2021-01-21 16:19:47 +0800	[diff] [blame]	237	kvfree(x);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	238	return 0;
				239	}
				240
				241	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	242	* caching an rbio means to copy anything from the
				243	* bio_pages array into the stripe_pages array. We
				244	* use the page uptodate bit in the stripe cache array
				245	* to indicate if it has valid data
				246	*
				247	* once the caching is done, we set the cache ready
				248	* bit.
				249	*/
				250	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				251	{
				252	int i;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	253	int ret;
				254
				255	ret = alloc_rbio_pages(rbio);
				256	if (ret)
				257	return;
				258
				259	for (i = 0; i < rbio->nr_pages; i++) {
				260	if (!rbio->bio_pages[i])
				261	continue;
				262
Ira Weiny	80cc838	2021-02-09 22:22:20 -0800	[diff] [blame]	263	copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	264	SetPageUptodate(rbio->stripe_pages[i]);
				265	}
				266	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				267	}
				268
				269	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	270	* we hash on the first logical address of the stripe
				271	*/
				272	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				273	{
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	274	u64 num = rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	275
				276	/*
				277	* we shift down quite a bit. We're using byte
				278	* addressing, and most of the lower bits are zeros.
				279	* This tends to upset hash_64, and it consistently
				280	* returns just one or two different values.
				281	*
				282	* shifting off the lower bits fixes things.
				283	*/
				284	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				285	}
				286
				287	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	288	* stealing an rbio means taking all the uptodate pages from the stripe
				289	* array in the source rbio and putting them into the destination rbio
				290	*/
				291	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				292	{
				293	int i;
				294	struct page *s;
				295	struct page *d;
				296
				297	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				298	return;
				299
				300	for (i = 0; i < dest->nr_pages; i++) {
				301	s = src->stripe_pages[i];
				302	if (!s \|\| !PageUptodate(s)) {
				303	continue;
				304	}
				305
				306	d = dest->stripe_pages[i];
				307	if (d)
				308	__free_page(d);
				309
				310	dest->stripe_pages[i] = s;
				311	src->stripe_pages[i] = NULL;
				312	}
				313	}
				314
				315	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	316	* merging means we take the bio_list from the victim and
				317	* splice it into the destination. The victim should
				318	* be discarded afterwards.
				319	*
				320	* must be called with dest->rbio_list_lock held
				321	*/
				322	static void merge_rbio(struct btrfs_raid_bio *dest,
				323	struct btrfs_raid_bio *victim)
				324	{
				325	bio_list_merge(&dest->bio_list, &victim->bio_list);
				326	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	327	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	328	bio_list_init(&victim->bio_list);
				329	}
				330
				331	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	332	* used to prune items that are in the cache. The caller
				333	* must hold the hash table lock.
				334	*/
				335	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				336	{
				337	int bucket = rbio_bucket(rbio);
				338	struct btrfs_stripe_hash_table *table;
				339	struct btrfs_stripe_hash *h;
				340	int freeit = 0;
				341
				342	/*
				343	* check the bit again under the hash table lock.
				344	*/
				345	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				346	return;
				347
				348	table = rbio->fs_info->stripe_hash_table;
				349	h = table->table + bucket;
				350
				351	/* hold the lock for the bucket because we may be
				352	* removing it from the hash table
				353	*/
				354	spin_lock(&h->lock);
				355
				356	/*
				357	* hold the lock for the bio list because we need
				358	* to make sure the bio list is empty
				359	*/
				360	spin_lock(&rbio->bio_list_lock);
				361
				362	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				363	list_del_init(&rbio->stripe_cache);
				364	table->cache_size -= 1;
				365	freeit = 1;
				366
				367	/* if the bio list isn't empty, this rbio is
				368	* still involved in an IO. We take it out
				369	* of the cache list, and drop the ref that
				370	* was held for the list.
				371	*
				372	* If the bio_list was empty, we also remove
				373	* the rbio from the hash_table, and drop
				374	* the corresponding ref
				375	*/
				376	if (bio_list_empty(&rbio->bio_list)) {
				377	if (!list_empty(&rbio->hash_list)) {
				378	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	379	refcount_dec(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	380	BUG_ON(!list_empty(&rbio->plug_list));
				381	}
				382	}
				383	}
				384
				385	spin_unlock(&rbio->bio_list_lock);
				386	spin_unlock(&h->lock);
				387
				388	if (freeit)
				389	__free_raid_bio(rbio);
				390	}
				391
				392	/*
				393	* prune a given rbio from the cache
				394	*/
				395	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				396	{
				397	struct btrfs_stripe_hash_table *table;
				398	unsigned long flags;
				399
				400	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				401	return;
				402
				403	table = rbio->fs_info->stripe_hash_table;
				404
				405	spin_lock_irqsave(&table->cache_lock, flags);
				406	__remove_rbio_from_cache(rbio);
				407	spin_unlock_irqrestore(&table->cache_lock, flags);
				408	}
				409
				410	/*
				411	* remove everything in the cache
				412	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	413	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	414	{
				415	struct btrfs_stripe_hash_table *table;
				416	unsigned long flags;
				417	struct btrfs_raid_bio *rbio;
				418
				419	table = info->stripe_hash_table;
				420
				421	spin_lock_irqsave(&table->cache_lock, flags);
				422	while (!list_empty(&table->stripe_cache)) {
				423	rbio = list_entry(table->stripe_cache.next,
				424	struct btrfs_raid_bio,
				425	stripe_cache);
				426	__remove_rbio_from_cache(rbio);
				427	}
				428	spin_unlock_irqrestore(&table->cache_lock, flags);
				429	}
				430
				431	/*
				432	* remove all cached entries and free the hash table
				433	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	434	*/
				435	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				436	{
				437	if (!info->stripe_hash_table)
				438	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	439	btrfs_clear_rbio_cache(info);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	440	kvfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	441	info->stripe_hash_table = NULL;
				442	}
				443
				444	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	445	* insert an rbio into the stripe cache. It
				446	* must have already been prepared by calling
				447	* cache_rbio_pages
				448	*
				449	* If this rbio was already cached, it gets
				450	* moved to the front of the lru.
				451	*
				452	* If the size of the rbio cache is too big, we
				453	* prune an item.
				454	*/
				455	static void cache_rbio(struct btrfs_raid_bio *rbio)
				456	{
				457	struct btrfs_stripe_hash_table *table;
				458	unsigned long flags;
				459
				460	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				461	return;
				462
				463	table = rbio->fs_info->stripe_hash_table;
				464
				465	spin_lock_irqsave(&table->cache_lock, flags);
				466	spin_lock(&rbio->bio_list_lock);
				467
				468	/* bump our ref if we were not in the list before */
				469	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	470	refcount_inc(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	471
				472	if (!list_empty(&rbio->stripe_cache)){
				473	list_move(&rbio->stripe_cache, &table->stripe_cache);
				474	} else {
				475	list_add(&rbio->stripe_cache, &table->stripe_cache);
				476	table->cache_size += 1;
				477	}
				478
				479	spin_unlock(&rbio->bio_list_lock);
				480
				481	if (table->cache_size > RBIO_CACHE_SIZE) {
				482	struct btrfs_raid_bio *found;
				483
				484	found = list_entry(table->stripe_cache.prev,
				485	struct btrfs_raid_bio,
				486	stripe_cache);
				487
				488	if (found != rbio)
				489	__remove_rbio_from_cache(found);
				490	}
				491
				492	spin_unlock_irqrestore(&table->cache_lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	493	}
				494
				495	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	496	* helper function to run the xor_blocks api. It is only
				497	* able to do MAX_XOR_BLOCKS at a time, so we need to
				498	* loop through.
				499	*/
				500	static void run_xor(void **pages, int src_cnt, ssize_t len)
				501	{
				502	int src_off = 0;
				503	int xor_src_cnt = 0;
				504	void *dest = pages[src_cnt];
				505
				506	while(src_cnt > 0) {
				507	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				508	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				509
				510	src_cnt -= xor_src_cnt;
				511	src_off += xor_src_cnt;
				512	}
				513	}
				514
				515	/*
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	516	* Returns true if the bio list inside this rbio covers an entire stripe (no
				517	* rmw required).
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	518	*/
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	519	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				520	{
				521	unsigned long flags;
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	522	unsigned long size = rbio->bio_list_bytes;
				523	int ret = 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	524
				525	spin_lock_irqsave(&rbio->bio_list_lock, flags);
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	526	if (size != rbio->nr_data * rbio->stripe_len)
				527	ret = 0;
				528	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	529	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	530
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	531	return ret;
				532	}
				533
				534	/*
				535	* returns 1 if it is safe to merge two rbios together.
				536	* The merging is safe if the two rbios correspond to
				537	* the same stripe and if they are both going in the same
				538	* direction (read vs write), and if neither one is
				539	* locked for final IO
				540	*
				541	* The caller is responsible for locking such that
				542	* rmw_locked is safe to test
				543	*/
				544	static int rbio_can_merge(struct btrfs_raid_bio *last,
				545	struct btrfs_raid_bio *cur)
				546	{
				547	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				548	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				549	return 0;
				550
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	551	/*
				552	* we can't merge with cached rbios, since the
				553	* idea is that when we merge the destination
				554	* rbio is going to run our IO for us. We can
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	555	* steal from cached rbios though, other functions
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	556	* handle that.
				557	*/
				558	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				559	test_bit(RBIO_CACHE_BIT, &cur->flags))
				560	return 0;
				561
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	562	if (last->bbio->raid_map[0] !=
				563	cur->bbio->raid_map[0])
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	564	return 0;
				565
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	566	/* we can't merge with different operations */
				567	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	568	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	569	/*
				570	* We've need read the full stripe from the drive.
				571	* check and repair the parity and write the new results.
				572	*
				573	* We're not allowed to add any new bios to the
				574	* bio list here, anyone else that wants to
				575	* change this stripe needs to do their own rmw.
				576	*/
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	577	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	578	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	579
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	580	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	581	return 0;
				582
Liu Bo	cc54ff6	2017-12-11 14:56:31 -0700	[diff] [blame]	583	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
				584	int fa = last->faila;
				585	int fb = last->failb;
				586	int cur_fa = cur->faila;
				587	int cur_fb = cur->failb;
				588
				589	if (last->faila >= last->failb) {
				590	fa = last->failb;
				591	fb = last->faila;
				592	}
				593
				594	if (cur->faila >= cur->failb) {
				595	cur_fa = cur->failb;
				596	cur_fb = cur->faila;
				597	}
				598
				599	if (fa != cur_fa \|\| fb != cur_fb)
				600	return 0;
				601	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	602	return 1;
				603	}
				604
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	605	static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				606	int index)
				607	{
				608	return stripe * rbio->stripe_npages + index;
				609	}
				610
				611	/*
				612	* these are just the pages from the rbio array, not from anything
				613	* the FS sent down to us
				614	*/
				615	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe,
				616	int index)
				617	{
				618	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
				619	}
				620
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	621	/*
				622	* helper to index into the pstripe
				623	*/
				624	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				625	{
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	626	return rbio_stripe_page(rbio, rbio->nr_data, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	627	}
				628
				629	/*
				630	* helper to index into the qstripe, returns null
				631	* if there is no qstripe
				632	*/
				633	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				634	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	635	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	636	return NULL;
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	637	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	638	}
				639
				640	/*
				641	* The first stripe in the table for a logical address
				642	* has the lock. rbios are added in one of three ways:
				643	*
				644	* 1) Nobody has the stripe locked yet. The rbio is given
				645	* the lock and 0 is returned. The caller must start the IO
				646	* themselves.
				647	*
				648	* 2) Someone has the stripe locked, but we're able to merge
				649	* with the lock owner. The rbio is freed and the IO will
				650	* start automatically along with the existing rbio. 1 is returned.
				651	*
				652	* 3) Someone has the stripe locked, but we're not able to merge.
				653	* The rbio is added to the lock owner's plug list, or merged into
				654	* an rbio already on the plug list. When the lock owner unlocks,
				655	* the next rbio on the list is run and the IO is started automatically.
				656	* 1 is returned
				657	*
				658	* If we return 0, the caller still owns the rbio and must continue with
				659	* IO submission. If we return 1, the caller must assume the rbio has
				660	* already been freed.
				661	*/
				662	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				663	{
Johannes Thumshirn	721860d	2019-10-18 11:58:21 +0200	[diff] [blame]	664	struct btrfs_stripe_hash *h;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	665	struct btrfs_raid_bio *cur;
				666	struct btrfs_raid_bio *pending;
				667	unsigned long flags;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	668	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	669	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	670	int ret = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	671
Johannes Thumshirn	721860d	2019-10-18 11:58:21 +0200	[diff] [blame]	672	h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
				673
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	674	spin_lock_irqsave(&h->lock, flags);
				675	list_for_each_entry(cur, &h->hash_list, hash_list) {
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	676	if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
				677	continue;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	678
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	679	spin_lock(&cur->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	680
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	681	/* Can we steal this cached rbio's pages? */
				682	if (bio_list_empty(&cur->bio_list) &&
				683	list_empty(&cur->plug_list) &&
				684	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				685	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				686	list_del_init(&cur->hash_list);
				687	refcount_dec(&cur->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	688
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	689	steal_rbio(cur, rbio);
				690	cache_drop = cur;
				691	spin_unlock(&cur->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	692
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	693	goto lockit;
				694	}
				695
				696	/* Can we merge into the lock owner? */
				697	if (rbio_can_merge(cur, rbio)) {
				698	merge_rbio(cur, rbio);
				699	spin_unlock(&cur->bio_list_lock);
				700	freeit = rbio;
				701	ret = 1;
				702	goto out;
				703	}
				704
				705
				706	/*
				707	* We couldn't merge with the running rbio, see if we can merge
				708	* with the pending ones. We don't have to check for rmw_locked
				709	* because there is no way they are inside finish_rmw right now
				710	*/
				711	list_for_each_entry(pending, &cur->plug_list, plug_list) {
				712	if (rbio_can_merge(pending, rbio)) {
				713	merge_rbio(pending, rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	714	spin_unlock(&cur->bio_list_lock);
				715	freeit = rbio;
				716	ret = 1;
				717	goto out;
				718	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	719	}
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	720
				721	/*
				722	* No merging, put us on the tail of the plug list, our rbio
				723	* will be started with the currently running rbio unlocks
				724	*/
				725	list_add_tail(&rbio->plug_list, &cur->plug_list);
				726	spin_unlock(&cur->bio_list_lock);
				727	ret = 1;
				728	goto out;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	729	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	730	lockit:
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	731	refcount_inc(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	732	list_add(&rbio->hash_list, &h->hash_list);
				733	out:
				734	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	735	if (cache_drop)
				736	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	737	if (freeit)
				738	__free_raid_bio(freeit);
				739	return ret;
				740	}
				741
				742	/*
				743	* called as rmw or parity rebuild is completed. If the plug list has more
				744	* rbios waiting for this stripe, the next one on the list will be started
				745	*/
				746	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				747	{
				748	int bucket;
				749	struct btrfs_stripe_hash *h;
				750	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	751	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	752
				753	bucket = rbio_bucket(rbio);
				754	h = rbio->fs_info->stripe_hash_table->table + bucket;
				755
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	756	if (list_empty(&rbio->plug_list))
				757	cache_rbio(rbio);
				758
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	759	spin_lock_irqsave(&h->lock, flags);
				760	spin_lock(&rbio->bio_list_lock);
				761
				762	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	763	/*
				764	* if we're still cached and there is no other IO
				765	* to perform, just leave this rbio here for others
				766	* to steal from later
				767	*/
				768	if (list_empty(&rbio->plug_list) &&
				769	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				770	keep_cache = 1;
				771	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				772	BUG_ON(!bio_list_empty(&rbio->bio_list));
				773	goto done;
				774	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	775
				776	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	777	refcount_dec(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	778
				779	/*
				780	* we use the plug list to hold all the rbios
				781	* waiting for the chance to lock this stripe.
				782	* hand the lock over to one of them.
				783	*/
				784	if (!list_empty(&rbio->plug_list)) {
				785	struct btrfs_raid_bio *next;
				786	struct list_head *head = rbio->plug_list.next;
				787
				788	next = list_entry(head, struct btrfs_raid_bio,
				789	plug_list);
				790
				791	list_del_init(&rbio->plug_list);
				792
				793	list_add(&next->hash_list, &h->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	794	refcount_inc(&next->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	795	spin_unlock(&rbio->bio_list_lock);
				796	spin_unlock_irqrestore(&h->lock, flags);
				797
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	798	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	799	start_async_work(next, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	800	else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				801	steal_rbio(rbio, next);
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	802	start_async_work(next, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	803	} else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	804	steal_rbio(rbio, next);
David Sterba	cf6a4a7	2018-06-29 10:56:58 +0200	[diff] [blame]	805	start_async_work(next, rmw_work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	806	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				807	steal_rbio(rbio, next);
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	808	start_async_work(next, scrub_parity_work);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	809	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	810
				811	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	812	}
				813	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	814	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	815	spin_unlock(&rbio->bio_list_lock);
				816	spin_unlock_irqrestore(&h->lock, flags);
				817
				818	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	819	if (!keep_cache)
				820	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	821	}
				822
				823	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				824	{
				825	int i;
				826
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	827	if (!refcount_dec_and_test(&rbio->refs))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	828	return;
				829
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	830	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	831	WARN_ON(!list_empty(&rbio->hash_list));
				832	WARN_ON(!bio_list_empty(&rbio->bio_list));
				833
				834	for (i = 0; i < rbio->nr_pages; i++) {
				835	if (rbio->stripe_pages[i]) {
				836	__free_page(rbio->stripe_pages[i]);
				837	rbio->stripe_pages[i] = NULL;
				838	}
				839	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	840
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	841	btrfs_put_bbio(rbio->bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	842	kfree(rbio);
				843	}
				844
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	845	static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	846	{
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	847	struct bio *next;
				848
				849	while (cur) {
				850	next = cur->bi_next;
				851	cur->bi_next = NULL;
				852	cur->bi_status = err;
				853	bio_endio(cur);
				854	cur = next;
				855	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	856	}
				857
				858	/*
				859	* this frees the rbio and runs through all the bios in the
				860	* bio_list and calls end_io on them
				861	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	862	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	863	{
				864	struct bio *cur = bio_list_get(&rbio->bio_list);
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	865	struct bio *extra;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	866
				867	if (rbio->generic_bio_cnt)
				868	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				869
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	870	/*
				871	* At this moment, rbio->bio_list is empty, however since rbio does not
				872	* always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
				873	* hash list, rbio may be merged with others so that rbio->bio_list
				874	* becomes non-empty.
				875	* Once unlock_stripe() is done, rbio->bio_list will not be updated any
				876	* more and we can call bio_endio() on all queued bios.
				877	*/
				878	unlock_stripe(rbio);
				879	extra = bio_list_get(&rbio->bio_list);
				880	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	881
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	882	rbio_endio_bio_list(cur, err);
				883	if (extra)
				884	rbio_endio_bio_list(extra, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	885	}
				886
				887	/*
				888	* end io function used by finish_rmw. When we finally
				889	* get here, we've written a full stripe
				890	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	891	static void raid_write_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	892	{
				893	struct btrfs_raid_bio *rbio = bio->bi_private;
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	894	blk_status_t err = bio->bi_status;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	895	int max_errors;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	896
				897	if (err)
				898	fail_bio_stripe(rbio, bio);
				899
				900	bio_put(bio);
				901
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	902	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	903	return;
				904
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	905	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	906
				907	/* OK, we have read all the stripes we need to. */
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	908	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
				909	0 : rbio->bbio->max_errors;
				910	if (atomic_read(&rbio->error) > max_errors)
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	911	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	912
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	913	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	914	}
				915
				916	/*
				917	* the read/modify/write code wants to use the original bio for
				918	* any pages it included, and then use the rbio for everything
				919	* else. This function decides if a given index (stripe number)
				920	* and page number in that stripe fall inside the original bio
				921	* or the rbio.
				922	*
				923	* if you set bio_list_only, you'll get a NULL back for any ranges
				924	* that are outside the bio_list
				925	*
				926	* This doesn't take any refs on anything, you get a bare page pointer
				927	* and the caller must bump refs as required.
				928	*
				929	* You must call index_rbio_pages once before you can trust
				930	* the answers from this function.
				931	*/
				932	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				933	int index, int pagenr, int bio_list_only)
				934	{
				935	int chunk_page;
				936	struct page *p = NULL;
				937
				938	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				939
				940	spin_lock_irq(&rbio->bio_list_lock);
				941	p = rbio->bio_pages[chunk_page];
				942	spin_unlock_irq(&rbio->bio_list_lock);
				943
				944	if (p \|\| bio_list_only)
				945	return p;
				946
				947	return rbio->stripe_pages[chunk_page];
				948	}
				949
				950	/*
				951	* number of pages we need for the entire stripe across all the
				952	* drives
				953	*/
				954	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				955	{
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	956	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	957	}
				958
				959	/*
				960	* allocation and initial setup for the btrfs_raid_bio. Not
				961	* this does not allocate any pages for rbio->pages.
				962	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	963	static struct btrfs_raid_bio alloc_rbio(struct btrfs_fs_info fs_info,
				964	struct btrfs_bio *bbio,
				965	u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	966	{
				967	struct btrfs_raid_bio *rbio;
				968	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	969	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				970	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	971	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	972	void *p;
				973
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	974	rbio = kzalloc(sizeof(*rbio) +
				975	sizeof(rbio->stripe_pages) num_pages +
				976	sizeof(rbio->bio_pages) num_pages +
				977	sizeof(rbio->finish_pointers) real_stripes +
				978	sizeof(rbio->dbitmap) BITS_TO_LONGS(stripe_npages) +
				979	sizeof(rbio->finish_pbitmap)
				980	BITS_TO_LONGS(stripe_npages),
				981	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	982	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	983	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	984
				985	bio_list_init(&rbio->bio_list);
				986	INIT_LIST_HEAD(&rbio->plug_list);
				987	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	988	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	989	INIT_LIST_HEAD(&rbio->hash_list);
				990	rbio->bbio = bbio;
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	991	rbio->fs_info = fs_info;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	992	rbio->stripe_len = stripe_len;
				993	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	994	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	995	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	996	rbio->faila = -1;
				997	rbio->failb = -1;
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	998	refcount_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	999	atomic_set(&rbio->error, 0);
				1000	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1001
				1002	/*
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1003	* the stripe_pages, bio_pages, etc arrays point to the extra
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1004	* memory we allocated past the end of the rbio
				1005	*/
				1006	p = rbio + 1;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1007	#define CONSUME_ALLOC(ptr, count) do { \
				1008	ptr = p; \
				1009	p = (unsigned char )p + sizeof((ptr)) * (count); \
				1010	} while (0)
				1011	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
				1012	CONSUME_ALLOC(rbio->bio_pages, num_pages);
				1013	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
				1014	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
				1015	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
				1016	#undef CONSUME_ALLOC
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1017
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1018	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				1019	nr_data = real_stripes - 1;
				1020	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1021	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1022	else
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1023	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1024
				1025	rbio->nr_data = nr_data;
				1026	return rbio;
				1027	}
				1028
				1029	/* allocate pages for all the stripes in the bio, including parity */
				1030	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1031	{
				1032	int i;
				1033	struct page *page;
				1034
				1035	for (i = 0; i < rbio->nr_pages; i++) {
				1036	if (rbio->stripe_pages[i])
				1037	continue;
				1038	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1039	if (!page)
				1040	return -ENOMEM;
				1041	rbio->stripe_pages[i] = page;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1042	}
				1043	return 0;
				1044	}
				1045
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1046	/* only allocate pages for p/q stripes */
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1047	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1048	{
				1049	int i;
				1050	struct page *page;
				1051
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1052	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1053
				1054	for (; i < rbio->nr_pages; i++) {
				1055	if (rbio->stripe_pages[i])
				1056	continue;
				1057	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1058	if (!page)
				1059	return -ENOMEM;
				1060	rbio->stripe_pages[i] = page;
				1061	}
				1062	return 0;
				1063	}
				1064
				1065	/*
				1066	* add a single page from a specific stripe into our list of bios for IO
				1067	* this will try to merge into existing bios if possible, and returns
				1068	* zero if all went well.
				1069	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1070	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1071	struct bio_list *bio_list,
				1072	struct page *page,
				1073	int stripe_nr,
				1074	unsigned long page_index,
				1075	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1076	{
				1077	struct bio *last = bio_list->tail;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1078	int ret;
				1079	struct bio *bio;
				1080	struct btrfs_bio_stripe *stripe;
				1081	u64 disk_start;
				1082
				1083	stripe = &rbio->bbio->stripes[stripe_nr];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1084	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1085
				1086	/* if the device is missing, just fail this stripe */
				1087	if (!stripe->dev->bdev)
				1088	return fail_rbio_index(rbio, stripe_nr);
				1089
				1090	/* see if we can add this page onto our existing bio */
				1091	if (last) {
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1092	u64 last_end = last->bi_iter.bi_sector << 9;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1093	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1094
				1095	/*
				1096	* we can't merge these if they are from different
				1097	* devices or if they are not contiguous
				1098	*/
Nikolay Borisov	f90ae76	2020-07-02 16:46:42 +0300	[diff] [blame]	1099	if (last_end == disk_start && !last->bi_status &&
Christoph Hellwig	309dca30	2021-01-24 11:02:34 +0100	[diff] [blame]	1100	last->bi_bdev == stripe->dev->bdev) {
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1101	ret = bio_add_page(last, page, PAGE_SIZE, 0);
				1102	if (ret == PAGE_SIZE)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1103	return 0;
				1104	}
				1105	}
				1106
				1107	/* put a new bio on the list */
David Sterba	c5e4c3d	2017-06-12 17:29:41 +0200	[diff] [blame]	1108	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
Nikolay Borisov	c31efbd	2020-07-03 11:14:27 +0300	[diff] [blame]	1109	btrfs_io_bio(bio)->device = stripe->dev;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1110	bio->bi_iter.bi_size = 0;
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1111	bio_set_dev(bio, stripe->dev->bdev);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1112	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1113
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1114	bio_add_page(bio, page, PAGE_SIZE, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1115	bio_list_add(bio_list, bio);
				1116	return 0;
				1117	}
				1118
				1119	/*
				1120	* while we're doing the read/modify/write cycle, we could
				1121	* have errors in reading pages off the disk. This checks
				1122	* for errors and if we're not able to read the page it'll
				1123	* trigger parity reconstruction. The rmw will be finished
				1124	* after we've reconstructed the failed stripes
				1125	*/
				1126	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1127	{
				1128	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1129	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1130	__raid56_parity_recover(rbio);
				1131	} else {
				1132	finish_rmw(rbio);
				1133	}
				1134	}
				1135
				1136	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1137	* helper function to walk our bio list and populate the bio_pages array with
				1138	* the result. This seems expensive, but it is faster than constantly
				1139	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1140	* reconstruction.
				1141	*
				1142	* This must be called before you trust the answers from page_in_rbio
				1143	*/
				1144	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1145	{
				1146	struct bio *bio;
				1147	u64 start;
				1148	unsigned long stripe_offset;
				1149	unsigned long page_index;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1150
				1151	spin_lock_irq(&rbio->bio_list_lock);
				1152	bio_list_for_each(bio, &rbio->bio_list) {
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1153	struct bio_vec bvec;
				1154	struct bvec_iter iter;
				1155	int i = 0;
				1156
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1157	start = bio->bi_iter.bi_sector << 9;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1158	stripe_offset = start - rbio->bbio->raid_map[0];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1159	page_index = stripe_offset >> PAGE_SHIFT;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1160
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1161	if (bio_flagged(bio, BIO_CLONED))
				1162	bio->bi_iter = btrfs_io_bio(bio)->iter;
				1163
				1164	bio_for_each_segment(bvec, bio, iter) {
				1165	rbio->bio_pages[page_index + i] = bvec.bv_page;
				1166	i++;
				1167	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1168	}
				1169	spin_unlock_irq(&rbio->bio_list_lock);
				1170	}
				1171
				1172	/*
				1173	* this is called from one of two situations. We either
				1174	* have a full stripe from the higher layers, or we've read all
				1175	* the missing bits off disk.
				1176	*
				1177	* This will calculate the parity and then send down any
				1178	* changed blocks.
				1179	*/
				1180	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1181	{
				1182	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1183	void **pointers = rbio->finish_pointers;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1184	int nr_data = rbio->nr_data;
				1185	int stripe;
				1186	int pagenr;
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	1187	bool has_qstripe;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1188	struct bio_list bio_list;
				1189	struct bio *bio;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1190	int ret;
				1191
				1192	bio_list_init(&bio_list);
				1193
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	1194	if (rbio->real_stripes - rbio->nr_data == 1)
				1195	has_qstripe = false;
				1196	else if (rbio->real_stripes - rbio->nr_data == 2)
				1197	has_qstripe = true;
				1198	else
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1199	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1200
				1201	/* at this point we either have a full stripe,
				1202	* or we've read the full stripe from the drive.
				1203	* recalculate the parity and write the new results.
				1204	*
				1205	* We're not allowed to add any new bios to the
				1206	* bio list here, anyone else that wants to
				1207	* change this stripe needs to do their own rmw.
				1208	*/
				1209	spin_lock_irq(&rbio->bio_list_lock);
				1210	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1211	spin_unlock_irq(&rbio->bio_list_lock);
				1212
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1213	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1214
				1215	/*
				1216	* now that we've set rmw_locked, run through the
				1217	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1218	*
				1219	* We don't cache full rbios because we're assuming
				1220	* the higher layers are unlikely to use this area of
				1221	* the disk again soon. If they do use it again,
				1222	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1223	*/
				1224	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1225	if (!rbio_is_full(rbio))
				1226	cache_rbio_pages(rbio);
				1227	else
				1228	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1229
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1230	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1231	struct page *p;
				1232	/* first collect one page from each data stripe */
				1233	for (stripe = 0; stripe < nr_data; stripe++) {
				1234	p = page_in_rbio(rbio, stripe, pagenr, 0);
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1235	pointers[stripe] = kmap_local_page(p);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1236	}
				1237
				1238	/* then add the parity stripe */
				1239	p = rbio_pstripe_page(rbio, pagenr);
				1240	SetPageUptodate(p);
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1241	pointers[stripe++] = kmap_local_page(p);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1242
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	1243	if (has_qstripe) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1244
				1245	/*
				1246	* raid6, add the qstripe and call the
				1247	* library function to fill in our p/q
				1248	*/
				1249	p = rbio_qstripe_page(rbio, pagenr);
				1250	SetPageUptodate(p);
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1251	pointers[stripe++] = kmap_local_page(p);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1252
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1253	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1254	pointers);
				1255	} else {
				1256	/* raid5 */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	1257	copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1258	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1259	}
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1260	for (stripe = stripe - 1; stripe >= 0; stripe--)
				1261	kunmap_local(pointers[stripe]);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1262	}
				1263
				1264	/*
				1265	* time to start writing. Make bios for everything from the
				1266	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1267	* everything else.
				1268	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1269	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1270	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1271	struct page *page;
				1272	if (stripe < rbio->nr_data) {
				1273	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1274	if (!page)
				1275	continue;
				1276	} else {
				1277	page = rbio_stripe_page(rbio, stripe, pagenr);
				1278	}
				1279
				1280	ret = rbio_add_io_page(rbio, &bio_list,
				1281	page, stripe, pagenr, rbio->stripe_len);
				1282	if (ret)
				1283	goto cleanup;
				1284	}
				1285	}
				1286
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1287	if (likely(!bbio->num_tgtdevs))
				1288	goto write_data;
				1289
				1290	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
				1291	if (!bbio->tgtdev_map[stripe])
				1292	continue;
				1293
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1294	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1295	struct page *page;
				1296	if (stripe < rbio->nr_data) {
				1297	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1298	if (!page)
				1299	continue;
				1300	} else {
				1301	page = rbio_stripe_page(rbio, stripe, pagenr);
				1302	}
				1303
				1304	ret = rbio_add_io_page(rbio, &bio_list, page,
				1305	rbio->bbio->tgtdev_map[stripe],
				1306	pagenr, rbio->stripe_len);
				1307	if (ret)
				1308	goto cleanup;
				1309	}
				1310	}
				1311
				1312	write_data:
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1313	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1314	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1315
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	1316	while ((bio = bio_list_pop(&bio_list))) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1317	bio->bi_private = rbio;
				1318	bio->bi_end_io = raid_write_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	1319	bio->bi_opf = REQ_OP_WRITE;
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1320
				1321	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1322	}
				1323	return;
				1324
				1325	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1326	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1327
				1328	while ((bio = bio_list_pop(&bio_list)))
				1329	bio_put(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1330	}
				1331
				1332	/*
				1333	* helper to find the stripe number for a given bio. Used to figure out which
				1334	* stripe has failed. This expects the bio to correspond to a physical disk,
				1335	* so it looks up based on physical sector numbers.
				1336	*/
				1337	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1338	struct bio *bio)
				1339	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1340	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1341	int i;
				1342	struct btrfs_bio_stripe *stripe;
				1343
				1344	physical <<= 9;
				1345
				1346	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1347	stripe = &rbio->bbio->stripes[i];
Nikolay Borisov	8302586	2020-07-02 16:46:45 +0300	[diff] [blame]	1348	if (in_range(physical, stripe->physical, rbio->stripe_len) &&
Christoph Hellwig	309dca30	2021-01-24 11:02:34 +0100	[diff] [blame]	1349	stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1350	return i;
				1351	}
				1352	}
				1353	return -1;
				1354	}
				1355
				1356	/*
				1357	* helper to find the stripe number for a given
				1358	* bio (before mapping). Used to figure out which stripe has
				1359	* failed. This looks up based on logical block numbers.
				1360	*/
				1361	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1362	struct bio *bio)
				1363	{
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1364	u64 logical = bio->bi_iter.bi_sector << 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1365	int i;
				1366
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1367	for (i = 0; i < rbio->nr_data; i++) {
Nikolay Borisov	8302586	2020-07-02 16:46:45 +0300	[diff] [blame]	1368	u64 stripe_start = rbio->bbio->raid_map[i];
				1369
				1370	if (in_range(logical, stripe_start, rbio->stripe_len))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1371	return i;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1372	}
				1373	return -1;
				1374	}
				1375
				1376	/*
				1377	* returns -EIO if we had too many failures
				1378	*/
				1379	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1380	{
				1381	unsigned long flags;
				1382	int ret = 0;
				1383
				1384	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1385
				1386	/* we already know this stripe is bad, move on */
				1387	if (rbio->faila == failed \|\| rbio->failb == failed)
				1388	goto out;
				1389
				1390	if (rbio->faila == -1) {
				1391	/* first failure on this rbio */
				1392	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1393	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1394	} else if (rbio->failb == -1) {
				1395	/* second failure on this rbio */
				1396	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1397	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1398	} else {
				1399	ret = -EIO;
				1400	}
				1401	out:
				1402	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1403
				1404	return ret;
				1405	}
				1406
				1407	/*
				1408	* helper to fail a stripe based on a physical disk
				1409	* bio.
				1410	*/
				1411	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1412	struct bio *bio)
				1413	{
				1414	int failed = find_bio_stripe(rbio, bio);
				1415
				1416	if (failed < 0)
				1417	return -EIO;
				1418
				1419	return fail_rbio_index(rbio, failed);
				1420	}
				1421
				1422	/*
				1423	* this sets each page in the bio uptodate. It should only be used on private
				1424	* rbio pages, nothing that comes in from the higher layers
				1425	*/
				1426	static void set_bio_pages_uptodate(struct bio *bio)
				1427	{
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1428	struct bio_vec *bvec;
Ming Lei	6dc4f10	2019-02-15 19:13:19 +0800	[diff] [blame]	1429	struct bvec_iter_all iter_all;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1430
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1431	ASSERT(!bio_flagged(bio, BIO_CLONED));
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1432
Christoph Hellwig	2b070cf	2019-04-25 09:03:00 +0200	[diff] [blame]	1433	bio_for_each_segment_all(bvec, bio, iter_all)
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1434	SetPageUptodate(bvec->bv_page);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1435	}
				1436
				1437	/*
				1438	* end io for the read phase of the rmw cycle. All the bios here are physical
				1439	* stripe bios we've read from the disk so we can recalculate the parity of the
				1440	* stripe.
				1441	*
				1442	* This will usually kick off finish_rmw once all the bios are read in, but it
				1443	* may trigger parity reconstruction if we had any errors along the way
				1444	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1445	static void raid_rmw_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1446	{
				1447	struct btrfs_raid_bio *rbio = bio->bi_private;
				1448
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1449	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1450	fail_bio_stripe(rbio, bio);
				1451	else
				1452	set_bio_pages_uptodate(bio);
				1453
				1454	bio_put(bio);
				1455
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1456	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1457	return;
				1458
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1459	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1460	goto cleanup;
				1461
				1462	/*
				1463	* this will normally call finish_rmw to start our write
				1464	* but if there are any failed stripes we'll reconstruct
				1465	* from parity first
				1466	*/
				1467	validate_rbio_for_rmw(rbio);
				1468	return;
				1469
				1470	cleanup:
				1471
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1472	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1473	}
				1474
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1475	/*
				1476	* the stripe must be locked by the caller. It will
				1477	* unlock after all the writes are done
				1478	*/
				1479	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1480	{
				1481	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1482	struct bio_list bio_list;
				1483	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1484	int pagenr;
				1485	int stripe;
				1486	struct bio *bio;
				1487
				1488	bio_list_init(&bio_list);
				1489
				1490	ret = alloc_rbio_pages(rbio);
				1491	if (ret)
				1492	goto cleanup;
				1493
				1494	index_rbio_pages(rbio);
				1495
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1496	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1497	/*
				1498	* build a list of bios to read all the missing parts of this
				1499	* stripe
				1500	*/
				1501	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1502	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1503	struct page *page;
				1504	/*
				1505	* we want to find all the pages missing from
				1506	* the rbio and read them from the disk. If
				1507	* page_in_rbio finds a page in the bio list
				1508	* we don't need to read it off the stripe.
				1509	*/
				1510	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1511	if (page)
				1512	continue;
				1513
				1514	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1515	/*
				1516	* the bio cache may have handed us an uptodate
				1517	* page. If so, be happy and use it
				1518	*/
				1519	if (PageUptodate(page))
				1520	continue;
				1521
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1522	ret = rbio_add_io_page(rbio, &bio_list, page,
				1523	stripe, pagenr, rbio->stripe_len);
				1524	if (ret)
				1525	goto cleanup;
				1526	}
				1527	}
				1528
				1529	bios_to_read = bio_list_size(&bio_list);
				1530	if (!bios_to_read) {
				1531	/*
				1532	* this can happen if others have merged with
				1533	* us, it means there is nothing left to read.
				1534	* But if there are missing devices it may not be
				1535	* safe to do the full stripe write yet.
				1536	*/
				1537	goto finish;
				1538	}
				1539
				1540	/*
				1541	* the bbio may be freed once we submit the last bio. Make sure
				1542	* not to touch it after that
				1543	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1544	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	1545	while ((bio = bio_list_pop(&bio_list))) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1546	bio->bi_private = rbio;
				1547	bio->bi_end_io = raid_rmw_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	1548	bio->bi_opf = REQ_OP_READ;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1549
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1550	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1551
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1552	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1553	}
				1554	/* the actual write will happen once the reads are done */
				1555	return 0;
				1556
				1557	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1558	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1559
				1560	while ((bio = bio_list_pop(&bio_list)))
				1561	bio_put(bio);
				1562
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1563	return -EIO;
				1564
				1565	finish:
				1566	validate_rbio_for_rmw(rbio);
				1567	return 0;
				1568	}
				1569
				1570	/*
				1571	* if the upper layers pass in a full stripe, we thank them by only allocating
				1572	* enough pages to hold the parity, and sending it all down quickly.
				1573	*/
				1574	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1575	{
				1576	int ret;
				1577
				1578	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1579	if (ret) {
				1580	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1581	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1582	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1583
				1584	ret = lock_stripe_add(rbio);
				1585	if (ret == 0)
				1586	finish_rmw(rbio);
				1587	return 0;
				1588	}
				1589
				1590	/*
				1591	* partial stripe writes get handed over to async helpers.
				1592	* We're really hoping to merge a few more writes into this
				1593	* rbio before calculating new parity
				1594	*/
				1595	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1596	{
				1597	int ret;
				1598
				1599	ret = lock_stripe_add(rbio);
				1600	if (ret == 0)
David Sterba	cf6a4a7	2018-06-29 10:56:58 +0200	[diff] [blame]	1601	start_async_work(rbio, rmw_work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1602	return 0;
				1603	}
				1604
				1605	/*
				1606	* sometimes while we were reading from the drive to
				1607	* recalculate parity, enough new bios come into create
				1608	* a full stripe. So we do a check here to see if we can
				1609	* go directly to finish_rmw
				1610	*/
				1611	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1612	{
				1613	/* head off into rmw land if we don't have a full stripe */
				1614	if (!rbio_is_full(rbio))
				1615	return partial_stripe_write(rbio);
				1616	return full_stripe_write(rbio);
				1617	}
				1618
				1619	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1620	* We use plugging call backs to collect full stripes.
				1621	* Any time we get a partial stripe write while plugged
				1622	* we collect it into a list. When the unplug comes down,
				1623	* we sort the list by logical block number and merge
				1624	* everything we can into the same rbios
				1625	*/
				1626	struct btrfs_plug_cb {
				1627	struct blk_plug_cb cb;
				1628	struct btrfs_fs_info *info;
				1629	struct list_head rbio_list;
				1630	struct btrfs_work work;
				1631	};
				1632
				1633	/*
				1634	* rbios on the plug list are sorted for easier merging.
				1635	*/
Sami Tolvanen	4f0f586	2021-04-08 11:28:34 -0700	[diff] [blame]	1636	static int plug_cmp(void priv, const struct list_head a,
				1637	const struct list_head *b)
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1638	{
				1639	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1640	plug_list);
				1641	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1642	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1643	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1644	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1645
				1646	if (a_sector < b_sector)
				1647	return -1;
				1648	if (a_sector > b_sector)
				1649	return 1;
				1650	return 0;
				1651	}
				1652
				1653	static void run_plug(struct btrfs_plug_cb *plug)
				1654	{
				1655	struct btrfs_raid_bio *cur;
				1656	struct btrfs_raid_bio *last = NULL;
				1657
				1658	/*
				1659	* sort our plug list then try to merge
				1660	* everything we can in hopes of creating full
				1661	* stripes.
				1662	*/
				1663	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1664	while (!list_empty(&plug->rbio_list)) {
				1665	cur = list_entry(plug->rbio_list.next,
				1666	struct btrfs_raid_bio, plug_list);
				1667	list_del_init(&cur->plug_list);
				1668
				1669	if (rbio_is_full(cur)) {
David Sterba	c7b562c	2018-06-29 10:57:10 +0200	[diff] [blame]	1670	int ret;
				1671
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1672	/* we have a full stripe, send it down */
David Sterba	c7b562c	2018-06-29 10:57:10 +0200	[diff] [blame]	1673	ret = full_stripe_write(cur);
				1674	BUG_ON(ret);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1675	continue;
				1676	}
				1677	if (last) {
				1678	if (rbio_can_merge(last, cur)) {
				1679	merge_rbio(last, cur);
				1680	__free_raid_bio(cur);
				1681	continue;
				1682
				1683	}
				1684	__raid56_parity_write(last);
				1685	}
				1686	last = cur;
				1687	}
				1688	if (last) {
				1689	__raid56_parity_write(last);
				1690	}
				1691	kfree(plug);
				1692	}
				1693
				1694	/*
				1695	* if the unplug comes from schedule, we have to push the
				1696	* work off to a helper thread
				1697	*/
				1698	static void unplug_work(struct btrfs_work *work)
				1699	{
				1700	struct btrfs_plug_cb *plug;
				1701	plug = container_of(work, struct btrfs_plug_cb, work);
				1702	run_plug(plug);
				1703	}
				1704
				1705	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1706	{
				1707	struct btrfs_plug_cb *plug;
				1708	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1709
				1710	if (from_schedule) {
Omar Sandoval	a0cac0e	2019-09-16 11:30:57 -0700	[diff] [blame]	1711	btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1712	btrfs_queue_work(plug->info->rmw_workers,
				1713	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1714	return;
				1715	}
				1716	run_plug(plug);
				1717	}
				1718
				1719	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1720	* our main entry point for writes from the rest of the FS.
				1721	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1722	int raid56_parity_write(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1723	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1724	{
				1725	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1726	struct btrfs_plug_cb *plug = NULL;
				1727	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1728	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1729
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1730	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1731	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1732	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1733	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1734	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1735	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1736	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1737	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1738
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1739	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1740	rbio->generic_bio_cnt = 1;
				1741
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1742	/*
				1743	* don't plug on full rbios, just get them out the door
				1744	* as quickly as we can
				1745	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1746	if (rbio_is_full(rbio)) {
				1747	ret = full_stripe_write(rbio);
				1748	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1749	btrfs_bio_counter_dec(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1750	return ret;
				1751	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1752
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1753	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1754	if (cb) {
				1755	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1756	if (!plug->info) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1757	plug->info = fs_info;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1758	INIT_LIST_HEAD(&plug->rbio_list);
				1759	}
				1760	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1761	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1762	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1763	ret = __raid56_parity_write(rbio);
				1764	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1765	btrfs_bio_counter_dec(fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1766	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1767	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1768	}
				1769
				1770	/*
				1771	* all parity reconstruction happens here. We've read in everything
				1772	* we can find from the drives and this does the heavy lifting of
				1773	* sorting the good from the bad.
				1774	*/
				1775	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1776	{
				1777	int pagenr, stripe;
				1778	void **pointers;
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1779	void **unmap_array;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1780	int faila = -1, failb = -1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1781	struct page *page;
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1782	blk_status_t err;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1783	int i;
				1784
David Sterba	31e818f	2015-02-20 18:00:26 +0100	[diff] [blame]	1785	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1786	if (!pointers) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1787	err = BLK_STS_RESOURCE;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1788	goto cleanup_io;
				1789	}
				1790
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1791	/*
				1792	* Store copy of pointers that does not get reordered during
				1793	* reconstruction so that kunmap_local works.
				1794	*/
				1795	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
				1796	if (!unmap_array) {
				1797	err = BLK_STS_RESOURCE;
				1798	goto cleanup_pointers;
				1799	}
				1800
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1801	faila = rbio->faila;
				1802	failb = rbio->failb;
				1803
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1804	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1805	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1806	spin_lock_irq(&rbio->bio_list_lock);
				1807	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1808	spin_unlock_irq(&rbio->bio_list_lock);
				1809	}
				1810
				1811	index_rbio_pages(rbio);
				1812
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1813	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1814	/*
				1815	* Now we just use bitmap to mark the horizontal stripes in
				1816	* which we have data when doing parity scrub.
				1817	*/
				1818	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1819	!test_bit(pagenr, rbio->dbitmap))
				1820	continue;
				1821
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1822	/*
				1823	* Setup our array of pointers with pages from each stripe
				1824	*
				1825	* NOTE: store a duplicate array of pointers to preserve the
				1826	* pointer order
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1827	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1828	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1829	/*
				1830	* if we're rebuilding a read, we have to use
				1831	* pages from the bio list
				1832	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1833	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1834	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1835	(stripe == faila \|\| stripe == failb)) {
				1836	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1837	} else {
				1838	page = rbio_stripe_page(rbio, stripe, pagenr);
				1839	}
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1840	pointers[stripe] = kmap_local_page(page);
				1841	unmap_array[stripe] = pointers[stripe];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1842	}
				1843
				1844	/* all raid6 handling here */
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1845	if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1846	/*
				1847	* single failure, rebuild from parity raid5
				1848	* style
				1849	*/
				1850	if (failb < 0) {
				1851	if (faila == rbio->nr_data) {
				1852	/*
				1853	* Just the P stripe has failed, without
				1854	* a bad data or Q stripe.
				1855	* TODO, we should redo the xor here.
				1856	*/
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1857	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1858	goto cleanup;
				1859	}
				1860	/*
				1861	* a single failure in raid6 is rebuilt
				1862	* in the pstripe code below
				1863	*/
				1864	goto pstripe;
				1865	}
				1866
				1867	/* make sure our ps and qs are in order */
Nikolay Borisov	b7d2083	2020-07-02 16:46:46 +0300	[diff] [blame]	1868	if (faila > failb)
				1869	swap(faila, failb);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1870
				1871	/* if the q stripe is failed, do a pstripe reconstruction
				1872	* from the xors.
				1873	* If both the q stripe and the P stripe are failed, we're
				1874	* here due to a crc mismatch and we can't give them the
				1875	* data they want
				1876	*/
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1877	if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1878	if (rbio->bbio->raid_map[faila] ==
				1879	RAID5_P_STRIPE) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1880	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1881	goto cleanup;
				1882	}
				1883	/*
				1884	* otherwise we have one bad data stripe and
				1885	* a good P stripe. raid5!
				1886	*/
				1887	goto pstripe;
				1888	}
				1889
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1890	if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1891	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1892	PAGE_SIZE, faila, pointers);
				1893	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1894	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1895	PAGE_SIZE, faila, failb,
				1896	pointers);
				1897	}
				1898	} else {
				1899	void *p;
				1900
				1901	/* rebuild from P stripe here (raid5 or raid6) */
				1902	BUG_ON(failb != -1);
				1903	pstripe:
				1904	/* Copy parity block into failed block to start with */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	1905	copy_page(pointers[faila], pointers[rbio->nr_data]);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1906
				1907	/* rearrange the pointer array */
				1908	p = pointers[faila];
				1909	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1910	pointers[stripe] = pointers[stripe + 1];
				1911	pointers[rbio->nr_data - 1] = p;
				1912
				1913	/* xor in the rest */
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1914	run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1915	}
				1916	/* if we're doing this rebuild as part of an rmw, go through
				1917	* and set all of our private rbio pages in the
				1918	* failed stripes as uptodate. This way finish_rmw will
				1919	* know they can be trusted. If this was a read reconstruction,
				1920	* other endio functions will fiddle the uptodate bits
				1921	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1922	if (rbio->operation == BTRFS_RBIO_WRITE) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1923	for (i = 0; i < rbio->stripe_npages; i++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1924	if (faila != -1) {
				1925	page = rbio_stripe_page(rbio, faila, i);
				1926	SetPageUptodate(page);
				1927	}
				1928	if (failb != -1) {
				1929	page = rbio_stripe_page(rbio, failb, i);
				1930	SetPageUptodate(page);
				1931	}
				1932	}
				1933	}
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1934	for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
				1935	kunmap_local(unmap_array[stripe]);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1936	}
				1937
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1938	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1939	cleanup:
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	1940	kfree(unmap_array);
				1941	cleanup_pointers:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1942	kfree(pointers);
				1943
				1944	cleanup_io:
Liu Bo	580c6ef	2018-03-22 09:20:11 +0800	[diff] [blame]	1945	/*
				1946	* Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
				1947	* valid rbio which is consistent with ondisk content, thus such a
				1948	* valid rbio can be cached to avoid further disk reads.
				1949	*/
				1950	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1951	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	1952	/*
				1953	* - In case of two failures, where rbio->failb != -1:
				1954	*
				1955	* Do not cache this rbio since the above read reconstruction
				1956	* (raid6_datap_recov() or raid6_2data_recov()) may have
				1957	* changed some content of stripes which are not identical to
				1958	* on-disk content any more, otherwise, a later write/recover
				1959	* may steal stripe_pages from this rbio and end up with
				1960	* corruptions or rebuild failures.
				1961	*
				1962	* - In case of single failure, where rbio->failb == -1:
				1963	*
				1964	* Cache this rbio iff the above read reconstruction is
Andrea Gelmini	52042d8	2018-11-28 12:05:13 +0100	[diff] [blame]	1965	* executed without problems.
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	1966	*/
				1967	if (err == BLK_STS_OK && rbio->failb < 0)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1968	cache_rbio_pages(rbio);
				1969	else
				1970	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1971
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1972	rbio_orig_end_io(rbio, err);
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1973	} else if (err == BLK_STS_OK) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1974	rbio->faila = -1;
				1975	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1976
				1977	if (rbio->operation == BTRFS_RBIO_WRITE)
				1978	finish_rmw(rbio);
				1979	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				1980	finish_parity_scrub(rbio, 0);
				1981	else
				1982	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1983	} else {
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1984	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1985	}
				1986	}
				1987
				1988	/*
				1989	* This is called only for stripes we've read from disk to
				1990	* reconstruct the parity.
				1991	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1992	static void raid_recover_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1993	{
				1994	struct btrfs_raid_bio *rbio = bio->bi_private;
				1995
				1996	/*
				1997	* we only read stripe pages off the disk, set them
				1998	* up to date if there were no errors
				1999	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2000	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2001	fail_bio_stripe(rbio, bio);
				2002	else
				2003	set_bio_pages_uptodate(bio);
				2004	bio_put(bio);
				2005
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2006	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2007	return;
				2008
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2009	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2010	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2011	else
				2012	__raid_recover_end_io(rbio);
				2013	}
				2014
				2015	/*
				2016	* reads everything we need off the disk to reconstruct
				2017	* the parity. endio handlers trigger final reconstruction
				2018	* when the IO is done.
				2019	*
				2020	* This is used both for reads from the higher layers and for
				2021	* parity construction required to finish a rmw cycle.
				2022	*/
				2023	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2024	{
				2025	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2026	struct bio_list bio_list;
				2027	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2028	int pagenr;
				2029	int stripe;
				2030	struct bio *bio;
				2031
				2032	bio_list_init(&bio_list);
				2033
				2034	ret = alloc_rbio_pages(rbio);
				2035	if (ret)
				2036	goto cleanup;
				2037
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2038	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2039
				2040	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2041	* read everything that hasn't failed. Thanks to the
				2042	* stripe cache, it is possible that some or all of these
				2043	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2044	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2045	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2046	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2047	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2048	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2049	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2050
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	2051	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2052	struct page *p;
				2053
				2054	/*
				2055	* the rmw code may have already read this
				2056	* page in
				2057	*/
				2058	p = rbio_stripe_page(rbio, stripe, pagenr);
				2059	if (PageUptodate(p))
				2060	continue;
				2061
				2062	ret = rbio_add_io_page(rbio, &bio_list,
				2063	rbio_stripe_page(rbio, stripe, pagenr),
				2064	stripe, pagenr, rbio->stripe_len);
				2065	if (ret < 0)
				2066	goto cleanup;
				2067	}
				2068	}
				2069
				2070	bios_to_read = bio_list_size(&bio_list);
				2071	if (!bios_to_read) {
				2072	/*
				2073	* we might have no bios to read just because the pages
				2074	* were up to date, or we might have no bios to read because
				2075	* the devices were gone.
				2076	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2077	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2078	__raid_recover_end_io(rbio);
Nikolay Borisov	813f8a0	2020-07-15 14:02:17 +0300	[diff] [blame]	2079	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2080	} else {
				2081	goto cleanup;
				2082	}
				2083	}
				2084
				2085	/*
				2086	* the bbio may be freed once we submit the last bio. Make sure
				2087	* not to touch it after that
				2088	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2089	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2090	while ((bio = bio_list_pop(&bio_list))) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2091	bio->bi_private = rbio;
				2092	bio->bi_end_io = raid_recover_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2093	bio->bi_opf = REQ_OP_READ;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2094
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2095	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2096
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2097	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2098	}
Nikolay Borisov	813f8a0	2020-07-15 14:02:17 +0300	[diff] [blame]	2099
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2100	return 0;
				2101
				2102	cleanup:
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2103	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				2104	rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2105	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2106
				2107	while ((bio = bio_list_pop(&bio_list)))
				2108	bio_put(bio);
				2109
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2110	return -EIO;
				2111	}
				2112
				2113	/*
				2114	* the main entry point for reads from the higher layers. This
				2115	* is really only called when the normal read path had a failure,
				2116	* so we assume the bio they send down corresponds to a failed part
				2117	* of the drive.
				2118	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2119	int raid56_parity_recover(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2120	struct btrfs_bio *bbio, u64 stripe_len,
				2121	int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2122	{
				2123	struct btrfs_raid_bio *rbio;
				2124	int ret;
				2125
Liu Bo	abad60c	2017-03-29 10:54:26 -0700	[diff] [blame]	2126	if (generic_io) {
				2127	ASSERT(bbio->mirror_num == mirror_num);
				2128	btrfs_io_bio(bio)->mirror_num = mirror_num;
				2129	}
				2130
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2131	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2132	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2133	if (generic_io)
				2134	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2135	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2136	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2137
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2138	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2139	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2140	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2141
				2142	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2143	if (rbio->faila == -1) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2144	btrfs_warn(fs_info,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2145	"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	2146	__func__, bio->bi_iter.bi_sector << 9,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2147	(u64)bio->bi_iter.bi_size, bbio->map_type);
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2148	if (generic_io)
				2149	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2150	kfree(rbio);
				2151	return -EIO;
				2152	}
				2153
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2154	if (generic_io) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2155	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2156	rbio->generic_bio_cnt = 1;
				2157	} else {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2158	btrfs_get_bbio(bbio);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2159	}
				2160
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2161	/*
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2162	* Loop retry:
				2163	* for 'mirror == 2', reconstruct from all other stripes.
				2164	* for 'mirror_num > 2', select a stripe to fail on every retry.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2165	*/
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2166	if (mirror_num > 2) {
				2167	/*
				2168	* 'mirror == 3' is to fail the p stripe and
				2169	* reconstruct from the q stripe. 'mirror > 3' is to
				2170	* fail a data stripe and reconstruct from p+q stripe.
				2171	*/
				2172	rbio->failb = rbio->real_stripes - (mirror_num - 1);
				2173	ASSERT(rbio->failb > 0);
				2174	if (rbio->failb <= rbio->faila)
				2175	rbio->failb--;
				2176	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2177
				2178	ret = lock_stripe_add(rbio);
				2179
				2180	/*
				2181	* __raid56_parity_recover will end the bio with
				2182	* any errors it hits. We don't want to return
				2183	* its error value up the stack because our caller
				2184	* will end up calling bio_endio with any nonzero
				2185	* return
				2186	*/
				2187	if (ret == 0)
				2188	__raid56_parity_recover(rbio);
				2189	/*
				2190	* our rbio has been added to the list of
				2191	* rbios that will be handled after the
				2192	* currently lock owner is done
				2193	*/
				2194	return 0;
				2195
				2196	}
				2197
				2198	static void rmw_work(struct btrfs_work *work)
				2199	{
				2200	struct btrfs_raid_bio *rbio;
				2201
				2202	rbio = container_of(work, struct btrfs_raid_bio, work);
				2203	raid56_rmw_stripe(rbio);
				2204	}
				2205
				2206	static void read_rebuild_work(struct btrfs_work *work)
				2207	{
				2208	struct btrfs_raid_bio *rbio;
				2209
				2210	rbio = container_of(work, struct btrfs_raid_bio, work);
				2211	__raid56_parity_recover(rbio);
				2212	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2213
				2214	/*
				2215	* The following code is used to scrub/replace the parity stripe
				2216	*
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2217	* Caller must have already increased bio_counter for getting @bbio.
				2218	*
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2219	* Note: We need make sure all the pages that add into the scrub/replace
				2220	* raid bio are correct and not be changed during the scrub/replace. That
				2221	* is those pages just hold metadata or file data with checksum.
				2222	*/
				2223
				2224	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2225	raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2226	struct btrfs_bio *bbio, u64 stripe_len,
				2227	struct btrfs_device *scrub_dev,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2228	unsigned long *dbitmap, int stripe_nsectors)
				2229	{
				2230	struct btrfs_raid_bio *rbio;
				2231	int i;
				2232
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2233	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2234	if (IS_ERR(rbio))
				2235	return NULL;
				2236	bio_list_add(&rbio->bio_list, bio);
				2237	/*
				2238	* This is a special bio which is used to hold the completion handler
				2239	* and make the scrub rbio is similar to the other types
				2240	*/
				2241	ASSERT(!bio->bi_iter.bi_size);
				2242	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2243
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2244	/*
				2245	* After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
				2246	* to the end position, so this search can start from the first parity
				2247	* stripe.
				2248	*/
				2249	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2250	if (bbio->stripes[i].dev == scrub_dev) {
				2251	rbio->scrubp = i;
				2252	break;
				2253	}
				2254	}
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2255	ASSERT(i < rbio->real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2256
				2257	/* Now we just support the sectorsize equals to page size */
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2258	ASSERT(fs_info->sectorsize == PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2259	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2260	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2261
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2262	/*
				2263	* We have already increased bio_counter when getting bbio, record it
				2264	* so we can free it at rbio_orig_end_io().
				2265	*/
				2266	rbio->generic_bio_cnt = 1;
				2267
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2268	return rbio;
				2269	}
				2270
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2271	/* Used for both parity scrub and missing. */
				2272	void raid56_add_scrub_pages(struct btrfs_raid_bio rbio, struct page page,
				2273	u64 logical)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2274	{
				2275	int stripe_offset;
				2276	int index;
				2277
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2278	ASSERT(logical >= rbio->bbio->raid_map[0]);
				2279	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2280	rbio->stripe_len * rbio->nr_data);
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2281	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2282	index = stripe_offset >> PAGE_SHIFT;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2283	rbio->bio_pages[index] = page;
				2284	}
				2285
				2286	/*
				2287	* We just scrub the parity that we have correct data on the same horizontal,
				2288	* so we needn't allocate all pages for all the stripes.
				2289	*/
				2290	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2291	{
				2292	int i;
				2293	int bit;
				2294	int index;
				2295	struct page *page;
				2296
				2297	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2298	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2299	index = i * rbio->stripe_npages + bit;
				2300	if (rbio->stripe_pages[index])
				2301	continue;
				2302
				2303	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2304	if (!page)
				2305	return -ENOMEM;
				2306	rbio->stripe_pages[index] = page;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2307	}
				2308	}
				2309	return 0;
				2310	}
				2311
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2312	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2313	int need_check)
				2314	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2315	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	2316	void **pointers = rbio->finish_pointers;
				2317	unsigned long *pbitmap = rbio->finish_pbitmap;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2318	int nr_data = rbio->nr_data;
				2319	int stripe;
				2320	int pagenr;
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2321	bool has_qstripe;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2322	struct page *p_page = NULL;
				2323	struct page *q_page = NULL;
				2324	struct bio_list bio_list;
				2325	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2326	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2327	int ret;
				2328
				2329	bio_list_init(&bio_list);
				2330
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2331	if (rbio->real_stripes - rbio->nr_data == 1)
				2332	has_qstripe = false;
				2333	else if (rbio->real_stripes - rbio->nr_data == 2)
				2334	has_qstripe = true;
				2335	else
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2336	BUG();
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2337
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2338	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2339	is_replace = 1;
				2340	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2341	}
				2342
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2343	/*
				2344	* Because the higher layers(scrubber) are unlikely to
				2345	* use this area of the disk again soon, so don't cache
				2346	* it.
				2347	*/
				2348	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2349
				2350	if (!need_check)
				2351	goto writeback;
				2352
				2353	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2354	if (!p_page)
				2355	goto cleanup;
				2356	SetPageUptodate(p_page);
				2357
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2358	if (has_qstripe) {
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2359	/* RAID6, allocate and map temp space for the Q stripe */
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2360	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2361	if (!q_page) {
				2362	__free_page(p_page);
				2363	goto cleanup;
				2364	}
				2365	SetPageUptodate(q_page);
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	2366	pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2367	}
				2368
				2369	atomic_set(&rbio->error, 0);
				2370
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2371	/* Map the parity stripe just once */
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	2372	pointers[nr_data] = kmap_local_page(p_page);
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2373
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2374	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2375	struct page *p;
				2376	void *parity;
				2377	/* first collect one page from each data stripe */
				2378	for (stripe = 0; stripe < nr_data; stripe++) {
				2379	p = page_in_rbio(rbio, stripe, pagenr, 0);
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	2380	pointers[stripe] = kmap_local_page(p);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2381	}
				2382
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2383	if (has_qstripe) {
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2384	/* RAID6, call the library function to fill in our P/Q */
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2385	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2386	pointers);
				2387	} else {
				2388	/* raid5 */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	2389	copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2390	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2391	}
				2392
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2393	/* Check scrubbing parity and repair it */
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2394	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
Ira Weiny	58c1a35	2021-02-16 18:48:23 -0800	[diff] [blame]	2395	parity = kmap_local_page(p);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2396	if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	2397	copy_page(parity, pointers[rbio->scrubp]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2398	else
				2399	/* Parity is right, needn't writeback */
				2400	bitmap_clear(rbio->dbitmap, pagenr, 1);
Ira Weiny	58c1a35	2021-02-16 18:48:23 -0800	[diff] [blame]	2401	kunmap_local(parity);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2402
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	2403	for (stripe = nr_data - 1; stripe >= 0; stripe--)
				2404	kunmap_local(pointers[stripe]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2405	}
				2406
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	2407	kunmap_local(pointers[nr_data]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2408	__free_page(p_page);
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2409	if (q_page) {
Ira Weiny	94a0b58	2021-02-16 18:48:24 -0800	[diff] [blame]	2410	kunmap_local(pointers[rbio->real_stripes - 1]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2411	__free_page(q_page);
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2412	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2413
				2414	writeback:
				2415	/*
				2416	* time to start writing. Make bios for everything from the
				2417	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2418	* everything else.
				2419	*/
				2420	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2421	struct page *page;
				2422
				2423	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2424	ret = rbio_add_io_page(rbio, &bio_list,
				2425	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2426	if (ret)
				2427	goto cleanup;
				2428	}
				2429
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2430	if (!is_replace)
				2431	goto submit_write;
				2432
				2433	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2434	struct page *page;
				2435
				2436	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2437	ret = rbio_add_io_page(rbio, &bio_list, page,
				2438	bbio->tgtdev_map[rbio->scrubp],
				2439	pagenr, rbio->stripe_len);
				2440	if (ret)
				2441	goto cleanup;
				2442	}
				2443
				2444	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2445	nr_data = bio_list_size(&bio_list);
				2446	if (!nr_data) {
				2447	/* Every parity is right */
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2448	rbio_orig_end_io(rbio, BLK_STS_OK);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2449	return;
				2450	}
				2451
				2452	atomic_set(&rbio->stripes_pending, nr_data);
				2453
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2454	while ((bio = bio_list_pop(&bio_list))) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2455	bio->bi_private = rbio;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	2456	bio->bi_end_io = raid_write_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2457	bio->bi_opf = REQ_OP_WRITE;
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2458
				2459	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2460	}
				2461	return;
				2462
				2463	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2464	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2465
				2466	while ((bio = bio_list_pop(&bio_list)))
				2467	bio_put(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2468	}
				2469
				2470	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2471	{
				2472	if (stripe >= 0 && stripe < rbio->nr_data)
				2473	return 1;
				2474	return 0;
				2475	}
				2476
				2477	/*
				2478	* While we're doing the parity check and repair, we could have errors
				2479	* in reading pages off the disk. This checks for errors and if we're
				2480	* not able to read the page it'll trigger parity reconstruction. The
				2481	* parity scrub will be finished after we've reconstructed the failed
				2482	* stripes
				2483	*/
				2484	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2485	{
				2486	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2487	goto cleanup;
				2488
				2489	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2490	int dfail = 0, failp = -1;
				2491
				2492	if (is_data_stripe(rbio, rbio->faila))
				2493	dfail++;
				2494	else if (is_parity_stripe(rbio->faila))
				2495	failp = rbio->faila;
				2496
				2497	if (is_data_stripe(rbio, rbio->failb))
				2498	dfail++;
				2499	else if (is_parity_stripe(rbio->failb))
				2500	failp = rbio->failb;
				2501
				2502	/*
				2503	* Because we can not use a scrubbing parity to repair
				2504	* the data, so the capability of the repair is declined.
				2505	* (In the case of RAID5, we can not repair anything)
				2506	*/
				2507	if (dfail > rbio->bbio->max_errors - 1)
				2508	goto cleanup;
				2509
				2510	/*
				2511	* If all data is good, only parity is correctly, just
				2512	* repair the parity.
				2513	*/
				2514	if (dfail == 0) {
				2515	finish_parity_scrub(rbio, 0);
				2516	return;
				2517	}
				2518
				2519	/*
				2520	* Here means we got one corrupted data stripe and one
				2521	* corrupted parity on RAID6, if the corrupted parity
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2522	* is scrubbing parity, luckily, use the other one to repair
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2523	* the data, or we can not repair the data stripe.
				2524	*/
				2525	if (failp != rbio->scrubp)
				2526	goto cleanup;
				2527
				2528	__raid_recover_end_io(rbio);
				2529	} else {
				2530	finish_parity_scrub(rbio, 1);
				2531	}
				2532	return;
				2533
				2534	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2535	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2536	}
				2537
				2538	/*
				2539	* end io for the read phase of the rmw cycle. All the bios here are physical
				2540	* stripe bios we've read from the disk so we can recalculate the parity of the
				2541	* stripe.
				2542	*
				2543	* This will usually kick off finish_rmw once all the bios are read in, but it
				2544	* may trigger parity reconstruction if we had any errors along the way
				2545	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2546	static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2547	{
				2548	struct btrfs_raid_bio *rbio = bio->bi_private;
				2549
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2550	if (bio->bi_status)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2551	fail_bio_stripe(rbio, bio);
				2552	else
				2553	set_bio_pages_uptodate(bio);
				2554
				2555	bio_put(bio);
				2556
				2557	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2558	return;
				2559
				2560	/*
				2561	* this will normally call finish_rmw to start our write
				2562	* but if there are any failed stripes we'll reconstruct
				2563	* from parity first
				2564	*/
				2565	validate_rbio_for_parity_scrub(rbio);
				2566	}
				2567
				2568	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2569	{
				2570	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2571	struct bio_list bio_list;
				2572	int ret;
				2573	int pagenr;
				2574	int stripe;
				2575	struct bio *bio;
				2576
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2577	bio_list_init(&bio_list);
				2578
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2579	ret = alloc_rbio_essential_pages(rbio);
				2580	if (ret)
				2581	goto cleanup;
				2582
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2583	atomic_set(&rbio->error, 0);
				2584	/*
				2585	* build a list of bios to read all the missing parts of this
				2586	* stripe
				2587	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2588	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2589	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2590	struct page *page;
				2591	/*
				2592	* we want to find all the pages missing from
				2593	* the rbio and read them from the disk. If
				2594	* page_in_rbio finds a page in the bio list
				2595	* we don't need to read it off the stripe.
				2596	*/
				2597	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2598	if (page)
				2599	continue;
				2600
				2601	page = rbio_stripe_page(rbio, stripe, pagenr);
				2602	/*
				2603	* the bio cache may have handed us an uptodate
				2604	* page. If so, be happy and use it
				2605	*/
				2606	if (PageUptodate(page))
				2607	continue;
				2608
				2609	ret = rbio_add_io_page(rbio, &bio_list, page,
				2610	stripe, pagenr, rbio->stripe_len);
				2611	if (ret)
				2612	goto cleanup;
				2613	}
				2614	}
				2615
				2616	bios_to_read = bio_list_size(&bio_list);
				2617	if (!bios_to_read) {
				2618	/*
				2619	* this can happen if others have merged with
				2620	* us, it means there is nothing left to read.
				2621	* But if there are missing devices it may not be
				2622	* safe to do the full stripe write yet.
				2623	*/
				2624	goto finish;
				2625	}
				2626
				2627	/*
				2628	* the bbio may be freed once we submit the last bio. Make sure
				2629	* not to touch it after that
				2630	*/
				2631	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2632	while ((bio = bio_list_pop(&bio_list))) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2633	bio->bi_private = rbio;
				2634	bio->bi_end_io = raid56_parity_scrub_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2635	bio->bi_opf = REQ_OP_READ;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2636
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2637	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2638
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2639	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2640	}
				2641	/* the actual write will happen once the reads are done */
				2642	return;
				2643
				2644	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2645	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2646
				2647	while ((bio = bio_list_pop(&bio_list)))
				2648	bio_put(bio);
				2649
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2650	return;
				2651
				2652	finish:
				2653	validate_rbio_for_parity_scrub(rbio);
				2654	}
				2655
				2656	static void scrub_parity_work(struct btrfs_work *work)
				2657	{
				2658	struct btrfs_raid_bio *rbio;
				2659
				2660	rbio = container_of(work, struct btrfs_raid_bio, work);
				2661	raid56_parity_scrub_stripe(rbio);
				2662	}
				2663
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2664	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2665	{
				2666	if (!lock_stripe_add(rbio))
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	2667	start_async_work(rbio, scrub_parity_work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2668	}
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2669
				2670	/* The following code is used for dev replace of a missing RAID 5/6 device. */
				2671
				2672	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2673	raid56_alloc_missing_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2674	struct btrfs_bio *bbio, u64 length)
				2675	{
				2676	struct btrfs_raid_bio *rbio;
				2677
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2678	rbio = alloc_rbio(fs_info, bbio, length);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2679	if (IS_ERR(rbio))
				2680	return NULL;
				2681
				2682	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
				2683	bio_list_add(&rbio->bio_list, bio);
				2684	/*
				2685	* This is a special bio which is used to hold the completion handler
				2686	* and make the scrub rbio is similar to the other types
				2687	*/
				2688	ASSERT(!bio->bi_iter.bi_size);
				2689
				2690	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2691	if (rbio->faila == -1) {
				2692	BUG();
				2693	kfree(rbio);
				2694	return NULL;
				2695	}
				2696
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2697	/*
				2698	* When we get bbio, we have already increased bio_counter, record it
				2699	* so we can free it at rbio_orig_end_io()
				2700	*/
				2701	rbio->generic_bio_cnt = 1;
				2702
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2703	return rbio;
				2704	}
				2705
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2706	void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
				2707	{
				2708	if (!lock_stripe_add(rbio))
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	2709	start_async_work(rbio, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2710	}