Blame - fs/btrfs/raid56.c - SHIFTPHONES/mainline/linux

blob: 8c31357f08ed9eb91784db8e468b3b5f49037e34 [file] [log] [blame]

David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2	/*
				3	* Copyright (C) 2012 Fusion-io All rights reserved.
				4	* Copyright (C) 2012 Intel Corp. All rights reserved.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	5	*/
David Sterba	c1d7c51	2018-04-03 19:23:33 +0200	[diff] [blame]	6
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	7	#include <linux/sched.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	8	#include <linux/bio.h>
				9	#include <linux/slab.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	10	#include <linux/blkdev.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	11	#include <linux/raid/pq.h>
				12	#include <linux/hash.h>
				13	#include <linux/list_sort.h>
				14	#include <linux/raid/xor.h>
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	15	#include <linux/mm.h>
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	16	#include "ctree.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	17	#include "disk-io.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	18	#include "volumes.h"
				19	#include "raid56.h"
				20	#include "async-thread.h"
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	21
				22	/* set when additional merges to this rbio are not allowed */
				23	#define RBIO_RMW_LOCKED_BIT 1
				24
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	25	/*
				26	* set when this rbio is sitting in the hash, but it is just a cache
				27	* of past RMW
				28	*/
				29	#define RBIO_CACHE_BIT 2
				30
				31	/*
				32	* set when it is safe to trust the stripe_pages for caching
				33	*/
				34	#define RBIO_CACHE_READY_BIT 3
				35
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	36	#define RBIO_CACHE_SIZE 1024
				37
David Sterba	8a95334	2019-08-21 19:06:17 +0200	[diff] [blame]	38	#define BTRFS_STRIPE_HASH_TABLE_BITS 11
				39
				40	/* Used by the raid56 code to lock stripes for read/modify/write */
				41	struct btrfs_stripe_hash {
				42	struct list_head hash_list;
				43	spinlock_t lock;
				44	};
				45
				46	/* Used by the raid56 code to lock stripes for read/modify/write */
				47	struct btrfs_stripe_hash_table {
				48	struct list_head stripe_cache;
				49	spinlock_t cache_lock;
				50	int cache_size;
				51	struct btrfs_stripe_hash table[];
				52	};
				53
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	54	enum btrfs_rbio_ops {
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	55	BTRFS_RBIO_WRITE,
				56	BTRFS_RBIO_READ_REBUILD,
				57	BTRFS_RBIO_PARITY_SCRUB,
				58	BTRFS_RBIO_REBUILD_MISSING,
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	59	};
				60
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	61	struct btrfs_raid_bio {
				62	struct btrfs_fs_info *fs_info;
				63	struct btrfs_bio *bbio;
				64
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	65	/* while we're doing rmw on a stripe
				66	* we put it into a hash table so we can
				67	* lock the stripe and merge more rbios
				68	* into it.
				69	*/
				70	struct list_head hash_list;
				71
				72	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	73	* LRU list for the stripe cache
				74	*/
				75	struct list_head stripe_cache;
				76
				77	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	78	* for scheduling work in the helper threads
				79	*/
				80	struct btrfs_work work;
				81
				82	/*
				83	* bio list and bio_list_lock are used
				84	* to add more bios into the stripe
				85	* in hopes of avoiding the full rmw
				86	*/
				87	struct bio_list bio_list;
				88	spinlock_t bio_list_lock;
				89
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	90	/* also protected by the bio_list_lock, the
				91	* plug list is used by the plugging code
				92	* to collect partial bios while plugged. The
				93	* stripe locking code also uses it to hand off
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	94	* the stripe lock to the next pending IO
				95	*/
				96	struct list_head plug_list;
				97
				98	/*
				99	* flags that tell us if it is safe to
				100	* merge with this bio
				101	*/
				102	unsigned long flags;
				103
				104	/* size of each individual stripe on disk */
				105	int stripe_len;
				106
				107	/* number of data stripes (no p/q) */
				108	int nr_data;
				109
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	110	int real_stripes;
				111
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	112	int stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	113	/*
				114	* set if we're doing a parity rebuild
				115	* for a read from higher up, which is handled
				116	* differently from a parity rebuild as part of
				117	* rmw
				118	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	119	enum btrfs_rbio_ops operation;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	120
				121	/* first bad stripe */
				122	int faila;
				123
				124	/* second bad stripe (for raid6 use) */
				125	int failb;
				126
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	127	int scrubp;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	128	/*
				129	* number of pages needed to represent the full
				130	* stripe
				131	*/
				132	int nr_pages;
				133
				134	/*
				135	* size of all the bios in the bio_list. This
				136	* helps us decide if the rbio maps to a full
				137	* stripe or not
				138	*/
				139	int bio_list_bytes;
				140
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	141	int generic_bio_cnt;
				142
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	143	refcount_t refs;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	144
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	145	atomic_t stripes_pending;
				146
				147	atomic_t error;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	148	/*
				149	* these are two arrays of pointers. We allocate the
				150	* rbio big enough to hold them both and setup their
				151	* locations when the rbio is allocated
				152	*/
				153
				154	/* pointers to pages that we allocated for
				155	* reading/writing stripes directly from the disk (including P/Q)
				156	*/
				157	struct page **stripe_pages;
				158
				159	/*
				160	* pointers to the pages in the bio_list. Stored
				161	* here for faster lookup
				162	*/
				163	struct page **bio_pages;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	164
				165	/*
				166	* bitmap to record which horizontal stripe has data
				167	*/
				168	unsigned long *dbitmap;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	169
				170	/* allocated with real_stripes-many pointers for finish_() calls /
				171	void **finish_pointers;
				172
				173	/* allocated with stripe_npages-many bits for finish_() calls /
				174	unsigned long *finish_pbitmap;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	175	};
				176
				177	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
				178	static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
				179	static void rmw_work(struct btrfs_work *work);
				180	static void read_rebuild_work(struct btrfs_work *work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	181	static int fail_bio_stripe(struct btrfs_raid_bio rbio, struct bio bio);
				182	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
				183	static void __free_raid_bio(struct btrfs_raid_bio *rbio);
				184	static void index_rbio_pages(struct btrfs_raid_bio *rbio);
				185	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
				186
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	187	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				188	int need_check);
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	189	static void scrub_parity_work(struct btrfs_work *work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	190
David Sterba	ac63885	2018-06-29 10:56:56 +0200	[diff] [blame]	191	static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
				192	{
Omar Sandoval	a0cac0e	2019-09-16 11:30:57 -0700	[diff] [blame]	193	btrfs_init_work(&rbio->work, work_func, NULL, NULL);
David Sterba	ac63885	2018-06-29 10:56:56 +0200	[diff] [blame]	194	btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
				195	}
				196
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	197	/*
				198	* the stripe hash table is used for locking, and to collect
				199	* bios in hopes of making a full stripe
				200	*/
				201	int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
				202	{
				203	struct btrfs_stripe_hash_table *table;
				204	struct btrfs_stripe_hash_table *x;
				205	struct btrfs_stripe_hash *cur;
				206	struct btrfs_stripe_hash *h;
				207	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
				208	int i;
				209
				210	if (info->stripe_hash_table)
				211	return 0;
				212
David Sterba	83c8266	2013-03-01 15:03:00 +0000	[diff] [blame]	213	/*
				214	* The table is large, starting with order 4 and can go as high as
				215	* order 7 in case lock debugging is turned on.
				216	*
				217	* Try harder to allocate and fallback to vmalloc to lower the chance
				218	* of a failing mount.
				219	*/
David Sterba	ee787f9	2019-03-29 02:07:02 +0100	[diff] [blame]	220	table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
David Sterba	818e010	2017-05-31 18:40:02 +0200	[diff] [blame]	221	if (!table)
				222	return -ENOMEM;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	223
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	224	spin_lock_init(&table->cache_lock);
				225	INIT_LIST_HEAD(&table->stripe_cache);
				226
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	227	h = table->table;
				228
				229	for (i = 0; i < num_entries; i++) {
				230	cur = h + i;
				231	INIT_LIST_HEAD(&cur->hash_list);
				232	spin_lock_init(&cur->lock);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	233	}
				234
				235	x = cmpxchg(&info->stripe_hash_table, NULL, table);
Yang Li	fe3b7bb	2021-01-21 16:19:47 +0800	[diff] [blame]	236	kvfree(x);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	237	return 0;
				238	}
				239
				240	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	241	* caching an rbio means to copy anything from the
				242	* bio_pages array into the stripe_pages array. We
				243	* use the page uptodate bit in the stripe cache array
				244	* to indicate if it has valid data
				245	*
				246	* once the caching is done, we set the cache ready
				247	* bit.
				248	*/
				249	static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
				250	{
				251	int i;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	252	int ret;
				253
				254	ret = alloc_rbio_pages(rbio);
				255	if (ret)
				256	return;
				257
				258	for (i = 0; i < rbio->nr_pages; i++) {
				259	if (!rbio->bio_pages[i])
				260	continue;
				261
Ira Weiny	80cc838	2021-02-09 22:22:20 -0800	[diff] [blame]	262	copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	263	SetPageUptodate(rbio->stripe_pages[i]);
				264	}
				265	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				266	}
				267
				268	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	269	* we hash on the first logical address of the stripe
				270	*/
				271	static int rbio_bucket(struct btrfs_raid_bio *rbio)
				272	{
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	273	u64 num = rbio->bbio->raid_map[0];
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	274
				275	/*
				276	* we shift down quite a bit. We're using byte
				277	* addressing, and most of the lower bits are zeros.
				278	* This tends to upset hash_64, and it consistently
				279	* returns just one or two different values.
				280	*
				281	* shifting off the lower bits fixes things.
				282	*/
				283	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
				284	}
				285
				286	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	287	* stealing an rbio means taking all the uptodate pages from the stripe
				288	* array in the source rbio and putting them into the destination rbio
				289	*/
				290	static void steal_rbio(struct btrfs_raid_bio src, struct btrfs_raid_bio dest)
				291	{
				292	int i;
				293	struct page *s;
				294	struct page *d;
				295
				296	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
				297	return;
				298
				299	for (i = 0; i < dest->nr_pages; i++) {
				300	s = src->stripe_pages[i];
				301	if (!s \|\| !PageUptodate(s)) {
				302	continue;
				303	}
				304
				305	d = dest->stripe_pages[i];
				306	if (d)
				307	__free_page(d);
				308
				309	dest->stripe_pages[i] = s;
				310	src->stripe_pages[i] = NULL;
				311	}
				312	}
				313
				314	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	315	* merging means we take the bio_list from the victim and
				316	* splice it into the destination. The victim should
				317	* be discarded afterwards.
				318	*
				319	* must be called with dest->rbio_list_lock held
				320	*/
				321	static void merge_rbio(struct btrfs_raid_bio *dest,
				322	struct btrfs_raid_bio *victim)
				323	{
				324	bio_list_merge(&dest->bio_list, &victim->bio_list);
				325	dest->bio_list_bytes += victim->bio_list_bytes;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	326	dest->generic_bio_cnt += victim->generic_bio_cnt;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	327	bio_list_init(&victim->bio_list);
				328	}
				329
				330	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	331	* used to prune items that are in the cache. The caller
				332	* must hold the hash table lock.
				333	*/
				334	static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				335	{
				336	int bucket = rbio_bucket(rbio);
				337	struct btrfs_stripe_hash_table *table;
				338	struct btrfs_stripe_hash *h;
				339	int freeit = 0;
				340
				341	/*
				342	* check the bit again under the hash table lock.
				343	*/
				344	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				345	return;
				346
				347	table = rbio->fs_info->stripe_hash_table;
				348	h = table->table + bucket;
				349
				350	/* hold the lock for the bucket because we may be
				351	* removing it from the hash table
				352	*/
				353	spin_lock(&h->lock);
				354
				355	/*
				356	* hold the lock for the bio list because we need
				357	* to make sure the bio list is empty
				358	*/
				359	spin_lock(&rbio->bio_list_lock);
				360
				361	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				362	list_del_init(&rbio->stripe_cache);
				363	table->cache_size -= 1;
				364	freeit = 1;
				365
				366	/* if the bio list isn't empty, this rbio is
				367	* still involved in an IO. We take it out
				368	* of the cache list, and drop the ref that
				369	* was held for the list.
				370	*
				371	* If the bio_list was empty, we also remove
				372	* the rbio from the hash_table, and drop
				373	* the corresponding ref
				374	*/
				375	if (bio_list_empty(&rbio->bio_list)) {
				376	if (!list_empty(&rbio->hash_list)) {
				377	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	378	refcount_dec(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	379	BUG_ON(!list_empty(&rbio->plug_list));
				380	}
				381	}
				382	}
				383
				384	spin_unlock(&rbio->bio_list_lock);
				385	spin_unlock(&h->lock);
				386
				387	if (freeit)
				388	__free_raid_bio(rbio);
				389	}
				390
				391	/*
				392	* prune a given rbio from the cache
				393	*/
				394	static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
				395	{
				396	struct btrfs_stripe_hash_table *table;
				397	unsigned long flags;
				398
				399	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
				400	return;
				401
				402	table = rbio->fs_info->stripe_hash_table;
				403
				404	spin_lock_irqsave(&table->cache_lock, flags);
				405	__remove_rbio_from_cache(rbio);
				406	spin_unlock_irqrestore(&table->cache_lock, flags);
				407	}
				408
				409	/*
				410	* remove everything in the cache
				411	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	412	static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	413	{
				414	struct btrfs_stripe_hash_table *table;
				415	unsigned long flags;
				416	struct btrfs_raid_bio *rbio;
				417
				418	table = info->stripe_hash_table;
				419
				420	spin_lock_irqsave(&table->cache_lock, flags);
				421	while (!list_empty(&table->stripe_cache)) {
				422	rbio = list_entry(table->stripe_cache.next,
				423	struct btrfs_raid_bio,
				424	stripe_cache);
				425	__remove_rbio_from_cache(rbio);
				426	}
				427	spin_unlock_irqrestore(&table->cache_lock, flags);
				428	}
				429
				430	/*
				431	* remove all cached entries and free the hash table
				432	* used by unmount
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	433	*/
				434	void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
				435	{
				436	if (!info->stripe_hash_table)
				437	return;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	438	btrfs_clear_rbio_cache(info);
Wang Shilong	f749303	2014-11-22 21:13:10 +0800	[diff] [blame]	439	kvfree(info->stripe_hash_table);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	440	info->stripe_hash_table = NULL;
				441	}
				442
				443	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	444	* insert an rbio into the stripe cache. It
				445	* must have already been prepared by calling
				446	* cache_rbio_pages
				447	*
				448	* If this rbio was already cached, it gets
				449	* moved to the front of the lru.
				450	*
				451	* If the size of the rbio cache is too big, we
				452	* prune an item.
				453	*/
				454	static void cache_rbio(struct btrfs_raid_bio *rbio)
				455	{
				456	struct btrfs_stripe_hash_table *table;
				457	unsigned long flags;
				458
				459	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
				460	return;
				461
				462	table = rbio->fs_info->stripe_hash_table;
				463
				464	spin_lock_irqsave(&table->cache_lock, flags);
				465	spin_lock(&rbio->bio_list_lock);
				466
				467	/* bump our ref if we were not in the list before */
				468	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	469	refcount_inc(&rbio->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	470
				471	if (!list_empty(&rbio->stripe_cache)){
				472	list_move(&rbio->stripe_cache, &table->stripe_cache);
				473	} else {
				474	list_add(&rbio->stripe_cache, &table->stripe_cache);
				475	table->cache_size += 1;
				476	}
				477
				478	spin_unlock(&rbio->bio_list_lock);
				479
				480	if (table->cache_size > RBIO_CACHE_SIZE) {
				481	struct btrfs_raid_bio *found;
				482
				483	found = list_entry(table->stripe_cache.prev,
				484	struct btrfs_raid_bio,
				485	stripe_cache);
				486
				487	if (found != rbio)
				488	__remove_rbio_from_cache(found);
				489	}
				490
				491	spin_unlock_irqrestore(&table->cache_lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	492	}
				493
				494	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	495	* helper function to run the xor_blocks api. It is only
				496	* able to do MAX_XOR_BLOCKS at a time, so we need to
				497	* loop through.
				498	*/
				499	static void run_xor(void **pages, int src_cnt, ssize_t len)
				500	{
				501	int src_off = 0;
				502	int xor_src_cnt = 0;
				503	void *dest = pages[src_cnt];
				504
				505	while(src_cnt > 0) {
				506	xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
				507	xor_blocks(xor_src_cnt, len, dest, pages + src_off);
				508
				509	src_cnt -= xor_src_cnt;
				510	src_off += xor_src_cnt;
				511	}
				512	}
				513
				514	/*
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	515	* Returns true if the bio list inside this rbio covers an entire stripe (no
				516	* rmw required).
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	517	*/
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	518	static int rbio_is_full(struct btrfs_raid_bio *rbio)
				519	{
				520	unsigned long flags;
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	521	unsigned long size = rbio->bio_list_bytes;
				522	int ret = 1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	523
				524	spin_lock_irqsave(&rbio->bio_list_lock, flags);
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	525	if (size != rbio->nr_data * rbio->stripe_len)
				526	ret = 0;
				527	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	528	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
David Sterba	176571a	2018-06-29 10:57:05 +0200	[diff] [blame]	529
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	530	return ret;
				531	}
				532
				533	/*
				534	* returns 1 if it is safe to merge two rbios together.
				535	* The merging is safe if the two rbios correspond to
				536	* the same stripe and if they are both going in the same
				537	* direction (read vs write), and if neither one is
				538	* locked for final IO
				539	*
				540	* The caller is responsible for locking such that
				541	* rmw_locked is safe to test
				542	*/
				543	static int rbio_can_merge(struct btrfs_raid_bio *last,
				544	struct btrfs_raid_bio *cur)
				545	{
				546	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) \|\|
				547	test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
				548	return 0;
				549
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	550	/*
				551	* we can't merge with cached rbios, since the
				552	* idea is that when we merge the destination
				553	* rbio is going to run our IO for us. We can
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	554	* steal from cached rbios though, other functions
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	555	* handle that.
				556	*/
				557	if (test_bit(RBIO_CACHE_BIT, &last->flags) \|\|
				558	test_bit(RBIO_CACHE_BIT, &cur->flags))
				559	return 0;
				560
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	561	if (last->bbio->raid_map[0] !=
				562	cur->bbio->raid_map[0])
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	563	return 0;
				564
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	565	/* we can't merge with different operations */
				566	if (last->operation != cur->operation)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	567	return 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	568	/*
				569	* We've need read the full stripe from the drive.
				570	* check and repair the parity and write the new results.
				571	*
				572	* We're not allowed to add any new bios to the
				573	* bio list here, anyone else that wants to
				574	* change this stripe needs to do their own rmw.
				575	*/
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	576	if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	577	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	578
Liu Bo	db34be1	2017-12-04 15:40:35 -0700	[diff] [blame]	579	if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	580	return 0;
				581
Liu Bo	cc54ff6	2017-12-11 14:56:31 -0700	[diff] [blame]	582	if (last->operation == BTRFS_RBIO_READ_REBUILD) {
				583	int fa = last->faila;
				584	int fb = last->failb;
				585	int cur_fa = cur->faila;
				586	int cur_fb = cur->failb;
				587
				588	if (last->faila >= last->failb) {
				589	fa = last->failb;
				590	fb = last->faila;
				591	}
				592
				593	if (cur->faila >= cur->failb) {
				594	cur_fa = cur->failb;
				595	cur_fb = cur->faila;
				596	}
				597
				598	if (fa != cur_fa \|\| fb != cur_fb)
				599	return 0;
				600	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	601	return 1;
				602	}
				603
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	604	static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
				605	int index)
				606	{
				607	return stripe * rbio->stripe_npages + index;
				608	}
				609
				610	/*
				611	* these are just the pages from the rbio array, not from anything
				612	* the FS sent down to us
				613	*/
				614	static struct page rbio_stripe_page(struct btrfs_raid_bio rbio, int stripe,
				615	int index)
				616	{
				617	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
				618	}
				619
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	620	/*
				621	* helper to index into the pstripe
				622	*/
				623	static struct page rbio_pstripe_page(struct btrfs_raid_bio rbio, int index)
				624	{
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	625	return rbio_stripe_page(rbio, rbio->nr_data, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	626	}
				627
				628	/*
				629	* helper to index into the qstripe, returns null
				630	* if there is no qstripe
				631	*/
				632	static struct page rbio_qstripe_page(struct btrfs_raid_bio rbio, int index)
				633	{
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	634	if (rbio->nr_data + 1 == rbio->real_stripes)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	635	return NULL;
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	636	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	637	}
				638
				639	/*
				640	* The first stripe in the table for a logical address
				641	* has the lock. rbios are added in one of three ways:
				642	*
				643	* 1) Nobody has the stripe locked yet. The rbio is given
				644	* the lock and 0 is returned. The caller must start the IO
				645	* themselves.
				646	*
				647	* 2) Someone has the stripe locked, but we're able to merge
				648	* with the lock owner. The rbio is freed and the IO will
				649	* start automatically along with the existing rbio. 1 is returned.
				650	*
				651	* 3) Someone has the stripe locked, but we're not able to merge.
				652	* The rbio is added to the lock owner's plug list, or merged into
				653	* an rbio already on the plug list. When the lock owner unlocks,
				654	* the next rbio on the list is run and the IO is started automatically.
				655	* 1 is returned
				656	*
				657	* If we return 0, the caller still owns the rbio and must continue with
				658	* IO submission. If we return 1, the caller must assume the rbio has
				659	* already been freed.
				660	*/
				661	static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
				662	{
Johannes Thumshirn	721860d	2019-10-18 11:58:21 +0200	[diff] [blame]	663	struct btrfs_stripe_hash *h;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	664	struct btrfs_raid_bio *cur;
				665	struct btrfs_raid_bio *pending;
				666	unsigned long flags;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	667	struct btrfs_raid_bio *freeit = NULL;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	668	struct btrfs_raid_bio *cache_drop = NULL;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	669	int ret = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	670
Johannes Thumshirn	721860d	2019-10-18 11:58:21 +0200	[diff] [blame]	671	h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
				672
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	673	spin_lock_irqsave(&h->lock, flags);
				674	list_for_each_entry(cur, &h->hash_list, hash_list) {
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	675	if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0])
				676	continue;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	677
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	678	spin_lock(&cur->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	679
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	680	/* Can we steal this cached rbio's pages? */
				681	if (bio_list_empty(&cur->bio_list) &&
				682	list_empty(&cur->plug_list) &&
				683	test_bit(RBIO_CACHE_BIT, &cur->flags) &&
				684	!test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				685	list_del_init(&cur->hash_list);
				686	refcount_dec(&cur->refs);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	687
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	688	steal_rbio(cur, rbio);
				689	cache_drop = cur;
				690	spin_unlock(&cur->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	691
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	692	goto lockit;
				693	}
				694
				695	/* Can we merge into the lock owner? */
				696	if (rbio_can_merge(cur, rbio)) {
				697	merge_rbio(cur, rbio);
				698	spin_unlock(&cur->bio_list_lock);
				699	freeit = rbio;
				700	ret = 1;
				701	goto out;
				702	}
				703
				704
				705	/*
				706	* We couldn't merge with the running rbio, see if we can merge
				707	* with the pending ones. We don't have to check for rmw_locked
				708	* because there is no way they are inside finish_rmw right now
				709	*/
				710	list_for_each_entry(pending, &cur->plug_list, plug_list) {
				711	if (rbio_can_merge(pending, rbio)) {
				712	merge_rbio(pending, rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	713	spin_unlock(&cur->bio_list_lock);
				714	freeit = rbio;
				715	ret = 1;
				716	goto out;
				717	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	718	}
Johannes Thumshirn	9d6cb1b	2019-10-18 11:58:20 +0200	[diff] [blame]	719
				720	/*
				721	* No merging, put us on the tail of the plug list, our rbio
				722	* will be started with the currently running rbio unlocks
				723	*/
				724	list_add_tail(&rbio->plug_list, &cur->plug_list);
				725	spin_unlock(&cur->bio_list_lock);
				726	ret = 1;
				727	goto out;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	728	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	729	lockit:
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	730	refcount_inc(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	731	list_add(&rbio->hash_list, &h->hash_list);
				732	out:
				733	spin_unlock_irqrestore(&h->lock, flags);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	734	if (cache_drop)
				735	remove_rbio_from_cache(cache_drop);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	736	if (freeit)
				737	__free_raid_bio(freeit);
				738	return ret;
				739	}
				740
				741	/*
				742	* called as rmw or parity rebuild is completed. If the plug list has more
				743	* rbios waiting for this stripe, the next one on the list will be started
				744	*/
				745	static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				746	{
				747	int bucket;
				748	struct btrfs_stripe_hash *h;
				749	unsigned long flags;
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	750	int keep_cache = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	751
				752	bucket = rbio_bucket(rbio);
				753	h = rbio->fs_info->stripe_hash_table->table + bucket;
				754
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	755	if (list_empty(&rbio->plug_list))
				756	cache_rbio(rbio);
				757
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	758	spin_lock_irqsave(&h->lock, flags);
				759	spin_lock(&rbio->bio_list_lock);
				760
				761	if (!list_empty(&rbio->hash_list)) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	762	/*
				763	* if we're still cached and there is no other IO
				764	* to perform, just leave this rbio here for others
				765	* to steal from later
				766	*/
				767	if (list_empty(&rbio->plug_list) &&
				768	test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
				769	keep_cache = 1;
				770	clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				771	BUG_ON(!bio_list_empty(&rbio->bio_list));
				772	goto done;
				773	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	774
				775	list_del_init(&rbio->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	776	refcount_dec(&rbio->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	777
				778	/*
				779	* we use the plug list to hold all the rbios
				780	* waiting for the chance to lock this stripe.
				781	* hand the lock over to one of them.
				782	*/
				783	if (!list_empty(&rbio->plug_list)) {
				784	struct btrfs_raid_bio *next;
				785	struct list_head *head = rbio->plug_list.next;
				786
				787	next = list_entry(head, struct btrfs_raid_bio,
				788	plug_list);
				789
				790	list_del_init(&rbio->plug_list);
				791
				792	list_add(&next->hash_list, &h->hash_list);
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	793	refcount_inc(&next->refs);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	794	spin_unlock(&rbio->bio_list_lock);
				795	spin_unlock_irqrestore(&h->lock, flags);
				796
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	797	if (next->operation == BTRFS_RBIO_READ_REBUILD)
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	798	start_async_work(next, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	799	else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
				800	steal_rbio(rbio, next);
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	801	start_async_work(next, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	802	} else if (next->operation == BTRFS_RBIO_WRITE) {
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	803	steal_rbio(rbio, next);
David Sterba	cf6a4a7	2018-06-29 10:56:58 +0200	[diff] [blame]	804	start_async_work(next, rmw_work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	805	} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				806	steal_rbio(rbio, next);
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	807	start_async_work(next, scrub_parity_work);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	808	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	809
				810	goto done_nolock;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	811	}
				812	}
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	813	done:
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	814	spin_unlock(&rbio->bio_list_lock);
				815	spin_unlock_irqrestore(&h->lock, flags);
				816
				817	done_nolock:
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	818	if (!keep_cache)
				819	remove_rbio_from_cache(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	820	}
				821
				822	static void __free_raid_bio(struct btrfs_raid_bio *rbio)
				823	{
				824	int i;
				825
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	826	if (!refcount_dec_and_test(&rbio->refs))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	827	return;
				828
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	829	WARN_ON(!list_empty(&rbio->stripe_cache));
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	830	WARN_ON(!list_empty(&rbio->hash_list));
				831	WARN_ON(!bio_list_empty(&rbio->bio_list));
				832
				833	for (i = 0; i < rbio->nr_pages; i++) {
				834	if (rbio->stripe_pages[i]) {
				835	__free_page(rbio->stripe_pages[i]);
				836	rbio->stripe_pages[i] = NULL;
				837	}
				838	}
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	839
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	840	btrfs_put_bbio(rbio->bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	841	kfree(rbio);
				842	}
				843
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	844	static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	845	{
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	846	struct bio *next;
				847
				848	while (cur) {
				849	next = cur->bi_next;
				850	cur->bi_next = NULL;
				851	cur->bi_status = err;
				852	bio_endio(cur);
				853	cur = next;
				854	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	855	}
				856
				857	/*
				858	* this frees the rbio and runs through all the bios in the
				859	* bio_list and calls end_io on them
				860	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	861	static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	862	{
				863	struct bio *cur = bio_list_get(&rbio->bio_list);
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	864	struct bio *extra;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	865
				866	if (rbio->generic_bio_cnt)
				867	btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
				868
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	869	/*
				870	* At this moment, rbio->bio_list is empty, however since rbio does not
				871	* always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
				872	* hash list, rbio may be merged with others so that rbio->bio_list
				873	* becomes non-empty.
				874	* Once unlock_stripe() is done, rbio->bio_list will not be updated any
				875	* more and we can call bio_endio() on all queued bios.
				876	*/
				877	unlock_stripe(rbio);
				878	extra = bio_list_get(&rbio->bio_list);
				879	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	880
Liu Bo	7583d8d	2018-01-09 18:36:25 -0700	[diff] [blame]	881	rbio_endio_bio_list(cur, err);
				882	if (extra)
				883	rbio_endio_bio_list(extra, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	884	}
				885
				886	/*
				887	* end io function used by finish_rmw. When we finally
				888	* get here, we've written a full stripe
				889	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	890	static void raid_write_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	891	{
				892	struct btrfs_raid_bio *rbio = bio->bi_private;
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	893	blk_status_t err = bio->bi_status;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	894	int max_errors;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	895
				896	if (err)
				897	fail_bio_stripe(rbio, bio);
				898
				899	bio_put(bio);
				900
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	901	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	902	return;
				903
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	904	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	905
				906	/* OK, we have read all the stripes we need to. */
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	907	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
				908	0 : rbio->bbio->max_errors;
				909	if (atomic_read(&rbio->error) > max_errors)
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	910	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	911
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	912	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	913	}
				914
				915	/*
				916	* the read/modify/write code wants to use the original bio for
				917	* any pages it included, and then use the rbio for everything
				918	* else. This function decides if a given index (stripe number)
				919	* and page number in that stripe fall inside the original bio
				920	* or the rbio.
				921	*
				922	* if you set bio_list_only, you'll get a NULL back for any ranges
				923	* that are outside the bio_list
				924	*
				925	* This doesn't take any refs on anything, you get a bare page pointer
				926	* and the caller must bump refs as required.
				927	*
				928	* You must call index_rbio_pages once before you can trust
				929	* the answers from this function.
				930	*/
				931	static struct page page_in_rbio(struct btrfs_raid_bio rbio,
				932	int index, int pagenr, int bio_list_only)
				933	{
				934	int chunk_page;
				935	struct page *p = NULL;
				936
				937	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
				938
				939	spin_lock_irq(&rbio->bio_list_lock);
				940	p = rbio->bio_pages[chunk_page];
				941	spin_unlock_irq(&rbio->bio_list_lock);
				942
				943	if (p \|\| bio_list_only)
				944	return p;
				945
				946	return rbio->stripe_pages[chunk_page];
				947	}
				948
				949	/*
				950	* number of pages we need for the entire stripe across all the
				951	* drives
				952	*/
				953	static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
				954	{
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	955	return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	956	}
				957
				958	/*
				959	* allocation and initial setup for the btrfs_raid_bio. Not
				960	* this does not allocate any pages for rbio->pages.
				961	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	962	static struct btrfs_raid_bio alloc_rbio(struct btrfs_fs_info fs_info,
				963	struct btrfs_bio *bbio,
				964	u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	965	{
				966	struct btrfs_raid_bio *rbio;
				967	int nr_data = 0;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	968	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
				969	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	970	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	971	void *p;
				972
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	973	rbio = kzalloc(sizeof(*rbio) +
				974	sizeof(rbio->stripe_pages) num_pages +
				975	sizeof(rbio->bio_pages) num_pages +
				976	sizeof(rbio->finish_pointers) real_stripes +
				977	sizeof(rbio->dbitmap) BITS_TO_LONGS(stripe_npages) +
				978	sizeof(rbio->finish_pbitmap)
				979	BITS_TO_LONGS(stripe_npages),
				980	GFP_NOFS);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	981	if (!rbio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	982	return ERR_PTR(-ENOMEM);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	983
				984	bio_list_init(&rbio->bio_list);
				985	INIT_LIST_HEAD(&rbio->plug_list);
				986	spin_lock_init(&rbio->bio_list_lock);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	987	INIT_LIST_HEAD(&rbio->stripe_cache);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	988	INIT_LIST_HEAD(&rbio->hash_list);
				989	rbio->bbio = bbio;
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	990	rbio->fs_info = fs_info;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	991	rbio->stripe_len = stripe_len;
				992	rbio->nr_pages = num_pages;
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	993	rbio->real_stripes = real_stripes;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	994	rbio->stripe_npages = stripe_npages;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	995	rbio->faila = -1;
				996	rbio->failb = -1;
Elena Reshetova	dec9557	2017-03-03 10:55:26 +0200	[diff] [blame]	997	refcount_set(&rbio->refs, 1);
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	998	atomic_set(&rbio->error, 0);
				999	atomic_set(&rbio->stripes_pending, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1000
				1001	/*
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1002	* the stripe_pages, bio_pages, etc arrays point to the extra
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1003	* memory we allocated past the end of the rbio
				1004	*/
				1005	p = rbio + 1;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1006	#define CONSUME_ALLOC(ptr, count) do { \
				1007	ptr = p; \
				1008	p = (unsigned char )p + sizeof((ptr)) * (count); \
				1009	} while (0)
				1010	CONSUME_ALLOC(rbio->stripe_pages, num_pages);
				1011	CONSUME_ALLOC(rbio->bio_pages, num_pages);
				1012	CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
				1013	CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
				1014	CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
				1015	#undef CONSUME_ALLOC
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1016
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1017	if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
				1018	nr_data = real_stripes - 1;
				1019	else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1020	nr_data = real_stripes - 2;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1021	else
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1022	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1023
				1024	rbio->nr_data = nr_data;
				1025	return rbio;
				1026	}
				1027
				1028	/* allocate pages for all the stripes in the bio, including parity */
				1029	static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
				1030	{
				1031	int i;
				1032	struct page *page;
				1033
				1034	for (i = 0; i < rbio->nr_pages; i++) {
				1035	if (rbio->stripe_pages[i])
				1036	continue;
				1037	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1038	if (!page)
				1039	return -ENOMEM;
				1040	rbio->stripe_pages[i] = page;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1041	}
				1042	return 0;
				1043	}
				1044
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1045	/* only allocate pages for p/q stripes */
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1046	static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
				1047	{
				1048	int i;
				1049	struct page *page;
				1050
Zhao Lei	b7178a5	2015-03-03 20:38:46 +0800	[diff] [blame]	1051	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1052
				1053	for (; i < rbio->nr_pages; i++) {
				1054	if (rbio->stripe_pages[i])
				1055	continue;
				1056	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				1057	if (!page)
				1058	return -ENOMEM;
				1059	rbio->stripe_pages[i] = page;
				1060	}
				1061	return 0;
				1062	}
				1063
				1064	/*
				1065	* add a single page from a specific stripe into our list of bios for IO
				1066	* this will try to merge into existing bios if possible, and returns
				1067	* zero if all went well.
				1068	*/
Eric Sandeen	48a3b63	2013-04-25 20:41:01 +0000	[diff] [blame]	1069	static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
				1070	struct bio_list *bio_list,
				1071	struct page *page,
				1072	int stripe_nr,
				1073	unsigned long page_index,
				1074	unsigned long bio_max_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1075	{
				1076	struct bio *last = bio_list->tail;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1077	int ret;
				1078	struct bio *bio;
				1079	struct btrfs_bio_stripe *stripe;
				1080	u64 disk_start;
				1081
				1082	stripe = &rbio->bbio->stripes[stripe_nr];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1083	disk_start = stripe->physical + (page_index << PAGE_SHIFT);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1084
				1085	/* if the device is missing, just fail this stripe */
				1086	if (!stripe->dev->bdev)
				1087	return fail_rbio_index(rbio, stripe_nr);
				1088
				1089	/* see if we can add this page onto our existing bio */
				1090	if (last) {
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1091	u64 last_end = last->bi_iter.bi_sector << 9;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1092	last_end += last->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1093
				1094	/*
				1095	* we can't merge these if they are from different
				1096	* devices or if they are not contiguous
				1097	*/
Nikolay Borisov	f90ae76	2020-07-02 16:46:42 +0300	[diff] [blame]	1098	if (last_end == disk_start && !last->bi_status &&
Christoph Hellwig	309dca30	2021-01-24 11:02:34 +0100	[diff] [blame]	1099	last->bi_bdev == stripe->dev->bdev) {
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1100	ret = bio_add_page(last, page, PAGE_SIZE, 0);
				1101	if (ret == PAGE_SIZE)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1102	return 0;
				1103	}
				1104	}
				1105
				1106	/* put a new bio on the list */
David Sterba	c5e4c3d	2017-06-12 17:29:41 +0200	[diff] [blame]	1107	bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
Nikolay Borisov	c31efbd	2020-07-03 11:14:27 +0300	[diff] [blame]	1108	btrfs_io_bio(bio)->device = stripe->dev;
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1109	bio->bi_iter.bi_size = 0;
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1110	bio_set_dev(bio, stripe->dev->bdev);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1111	bio->bi_iter.bi_sector = disk_start >> 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1112
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1113	bio_add_page(bio, page, PAGE_SIZE, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1114	bio_list_add(bio_list, bio);
				1115	return 0;
				1116	}
				1117
				1118	/*
				1119	* while we're doing the read/modify/write cycle, we could
				1120	* have errors in reading pages off the disk. This checks
				1121	* for errors and if we're not able to read the page it'll
				1122	* trigger parity reconstruction. The rmw will be finished
				1123	* after we've reconstructed the failed stripes
				1124	*/
				1125	static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
				1126	{
				1127	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1128	BUG_ON(rbio->faila == rbio->real_stripes - 1);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1129	__raid56_parity_recover(rbio);
				1130	} else {
				1131	finish_rmw(rbio);
				1132	}
				1133	}
				1134
				1135	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1136	* helper function to walk our bio list and populate the bio_pages array with
				1137	* the result. This seems expensive, but it is faster than constantly
				1138	* searching through the bio list as we setup the IO in finish_rmw or stripe
				1139	* reconstruction.
				1140	*
				1141	* This must be called before you trust the answers from page_in_rbio
				1142	*/
				1143	static void index_rbio_pages(struct btrfs_raid_bio *rbio)
				1144	{
				1145	struct bio *bio;
				1146	u64 start;
				1147	unsigned long stripe_offset;
				1148	unsigned long page_index;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1149
				1150	spin_lock_irq(&rbio->bio_list_lock);
				1151	bio_list_for_each(bio, &rbio->bio_list) {
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1152	struct bio_vec bvec;
				1153	struct bvec_iter iter;
				1154	int i = 0;
				1155
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1156	start = bio->bi_iter.bi_sector << 9;
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1157	stripe_offset = start - rbio->bbio->raid_map[0];
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1158	page_index = stripe_offset >> PAGE_SHIFT;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1159
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1160	if (bio_flagged(bio, BIO_CLONED))
				1161	bio->bi_iter = btrfs_io_bio(bio)->iter;
				1162
				1163	bio_for_each_segment(bvec, bio, iter) {
				1164	rbio->bio_pages[page_index + i] = bvec.bv_page;
				1165	i++;
				1166	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1167	}
				1168	spin_unlock_irq(&rbio->bio_list_lock);
				1169	}
				1170
				1171	/*
				1172	* this is called from one of two situations. We either
				1173	* have a full stripe from the higher layers, or we've read all
				1174	* the missing bits off disk.
				1175	*
				1176	* This will calculate the parity and then send down any
				1177	* changed blocks.
				1178	*/
				1179	static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
				1180	{
				1181	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	1182	void **pointers = rbio->finish_pointers;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1183	int nr_data = rbio->nr_data;
				1184	int stripe;
				1185	int pagenr;
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	1186	bool has_qstripe;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1187	struct bio_list bio_list;
				1188	struct bio *bio;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1189	int ret;
				1190
				1191	bio_list_init(&bio_list);
				1192
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	1193	if (rbio->real_stripes - rbio->nr_data == 1)
				1194	has_qstripe = false;
				1195	else if (rbio->real_stripes - rbio->nr_data == 2)
				1196	has_qstripe = true;
				1197	else
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1198	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1199
				1200	/* at this point we either have a full stripe,
				1201	* or we've read the full stripe from the drive.
				1202	* recalculate the parity and write the new results.
				1203	*
				1204	* We're not allowed to add any new bios to the
				1205	* bio list here, anyone else that wants to
				1206	* change this stripe needs to do their own rmw.
				1207	*/
				1208	spin_lock_irq(&rbio->bio_list_lock);
				1209	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1210	spin_unlock_irq(&rbio->bio_list_lock);
				1211
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1212	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1213
				1214	/*
				1215	* now that we've set rmw_locked, run through the
				1216	* bio list one last time and map the page pointers
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1217	*
				1218	* We don't cache full rbios because we're assuming
				1219	* the higher layers are unlikely to use this area of
				1220	* the disk again soon. If they do use it again,
				1221	* hopefully they will send another full bio.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1222	*/
				1223	index_rbio_pages(rbio);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1224	if (!rbio_is_full(rbio))
				1225	cache_rbio_pages(rbio);
				1226	else
				1227	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1228
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1229	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1230	struct page *p;
				1231	/* first collect one page from each data stripe */
				1232	for (stripe = 0; stripe < nr_data; stripe++) {
				1233	p = page_in_rbio(rbio, stripe, pagenr, 0);
				1234	pointers[stripe] = kmap(p);
				1235	}
				1236
				1237	/* then add the parity stripe */
				1238	p = rbio_pstripe_page(rbio, pagenr);
				1239	SetPageUptodate(p);
				1240	pointers[stripe++] = kmap(p);
				1241
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	1242	if (has_qstripe) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1243
				1244	/*
				1245	* raid6, add the qstripe and call the
				1246	* library function to fill in our p/q
				1247	*/
				1248	p = rbio_qstripe_page(rbio, pagenr);
				1249	SetPageUptodate(p);
				1250	pointers[stripe++] = kmap(p);
				1251
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1252	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1253	pointers);
				1254	} else {
				1255	/* raid5 */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	1256	copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1257	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1258	}
				1259
				1260
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1261	for (stripe = 0; stripe < rbio->real_stripes; stripe++)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1262	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				1263	}
				1264
				1265	/*
				1266	* time to start writing. Make bios for everything from the
				1267	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				1268	* everything else.
				1269	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1270	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1271	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1272	struct page *page;
				1273	if (stripe < rbio->nr_data) {
				1274	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1275	if (!page)
				1276	continue;
				1277	} else {
				1278	page = rbio_stripe_page(rbio, stripe, pagenr);
				1279	}
				1280
				1281	ret = rbio_add_io_page(rbio, &bio_list,
				1282	page, stripe, pagenr, rbio->stripe_len);
				1283	if (ret)
				1284	goto cleanup;
				1285	}
				1286	}
				1287
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1288	if (likely(!bbio->num_tgtdevs))
				1289	goto write_data;
				1290
				1291	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
				1292	if (!bbio->tgtdev_map[stripe])
				1293	continue;
				1294
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1295	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1296	struct page *page;
				1297	if (stripe < rbio->nr_data) {
				1298	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1299	if (!page)
				1300	continue;
				1301	} else {
				1302	page = rbio_stripe_page(rbio, stripe, pagenr);
				1303	}
				1304
				1305	ret = rbio_add_io_page(rbio, &bio_list, page,
				1306	rbio->bbio->tgtdev_map[stripe],
				1307	pagenr, rbio->stripe_len);
				1308	if (ret)
				1309	goto cleanup;
				1310	}
				1311	}
				1312
				1313	write_data:
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1314	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
				1315	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1316
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	1317	while ((bio = bio_list_pop(&bio_list))) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1318	bio->bi_private = rbio;
				1319	bio->bi_end_io = raid_write_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	1320	bio->bi_opf = REQ_OP_WRITE;
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1321
				1322	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1323	}
				1324	return;
				1325
				1326	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1327	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1328
				1329	while ((bio = bio_list_pop(&bio_list)))
				1330	bio_put(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1331	}
				1332
				1333	/*
				1334	* helper to find the stripe number for a given bio. Used to figure out which
				1335	* stripe has failed. This expects the bio to correspond to a physical disk,
				1336	* so it looks up based on physical sector numbers.
				1337	*/
				1338	static int find_bio_stripe(struct btrfs_raid_bio *rbio,
				1339	struct bio *bio)
				1340	{
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1341	u64 physical = bio->bi_iter.bi_sector;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1342	int i;
				1343	struct btrfs_bio_stripe *stripe;
				1344
				1345	physical <<= 9;
				1346
				1347	for (i = 0; i < rbio->bbio->num_stripes; i++) {
				1348	stripe = &rbio->bbio->stripes[i];
Nikolay Borisov	8302586	2020-07-02 16:46:45 +0300	[diff] [blame]	1349	if (in_range(physical, stripe->physical, rbio->stripe_len) &&
Christoph Hellwig	309dca30	2021-01-24 11:02:34 +0100	[diff] [blame]	1350	stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1351	return i;
				1352	}
				1353	}
				1354	return -1;
				1355	}
				1356
				1357	/*
				1358	* helper to find the stripe number for a given
				1359	* bio (before mapping). Used to figure out which stripe has
				1360	* failed. This looks up based on logical block numbers.
				1361	*/
				1362	static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				1363	struct bio *bio)
				1364	{
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	1365	u64 logical = bio->bi_iter.bi_sector << 9;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1366	int i;
				1367
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1368	for (i = 0; i < rbio->nr_data; i++) {
Nikolay Borisov	8302586	2020-07-02 16:46:45 +0300	[diff] [blame]	1369	u64 stripe_start = rbio->bbio->raid_map[i];
				1370
				1371	if (in_range(logical, stripe_start, rbio->stripe_len))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1372	return i;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1373	}
				1374	return -1;
				1375	}
				1376
				1377	/*
				1378	* returns -EIO if we had too many failures
				1379	*/
				1380	static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
				1381	{
				1382	unsigned long flags;
				1383	int ret = 0;
				1384
				1385	spin_lock_irqsave(&rbio->bio_list_lock, flags);
				1386
				1387	/* we already know this stripe is bad, move on */
				1388	if (rbio->faila == failed \|\| rbio->failb == failed)
				1389	goto out;
				1390
				1391	if (rbio->faila == -1) {
				1392	/* first failure on this rbio */
				1393	rbio->faila = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1394	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1395	} else if (rbio->failb == -1) {
				1396	/* second failure on this rbio */
				1397	rbio->failb = failed;
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1398	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1399	} else {
				1400	ret = -EIO;
				1401	}
				1402	out:
				1403	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
				1404
				1405	return ret;
				1406	}
				1407
				1408	/*
				1409	* helper to fail a stripe based on a physical disk
				1410	* bio.
				1411	*/
				1412	static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
				1413	struct bio *bio)
				1414	{
				1415	int failed = find_bio_stripe(rbio, bio);
				1416
				1417	if (failed < 0)
				1418	return -EIO;
				1419
				1420	return fail_rbio_index(rbio, failed);
				1421	}
				1422
				1423	/*
				1424	* this sets each page in the bio uptodate. It should only be used on private
				1425	* rbio pages, nothing that comes in from the higher layers
				1426	*/
				1427	static void set_bio_pages_uptodate(struct bio *bio)
				1428	{
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1429	struct bio_vec *bvec;
Ming Lei	6dc4f10	2019-02-15 19:13:19 +0800	[diff] [blame]	1430	struct bvec_iter_all iter_all;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1431
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1432	ASSERT(!bio_flagged(bio, BIO_CLONED));
Filipe Manana	6592e58	2017-07-12 23:36:02 +0100	[diff] [blame]	1433
Christoph Hellwig	2b070cf	2019-04-25 09:03:00 +0200	[diff] [blame]	1434	bio_for_each_segment_all(bvec, bio, iter_all)
Liu Bo	0198e5b	2018-01-12 18:07:01 -0700	[diff] [blame]	1435	SetPageUptodate(bvec->bv_page);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1436	}
				1437
				1438	/*
				1439	* end io for the read phase of the rmw cycle. All the bios here are physical
				1440	* stripe bios we've read from the disk so we can recalculate the parity of the
				1441	* stripe.
				1442	*
				1443	* This will usually kick off finish_rmw once all the bios are read in, but it
				1444	* may trigger parity reconstruction if we had any errors along the way
				1445	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1446	static void raid_rmw_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1447	{
				1448	struct btrfs_raid_bio *rbio = bio->bi_private;
				1449
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1450	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1451	fail_bio_stripe(rbio, bio);
				1452	else
				1453	set_bio_pages_uptodate(bio);
				1454
				1455	bio_put(bio);
				1456
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1457	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1458	return;
				1459
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1460	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1461	goto cleanup;
				1462
				1463	/*
				1464	* this will normally call finish_rmw to start our write
				1465	* but if there are any failed stripes we'll reconstruct
				1466	* from parity first
				1467	*/
				1468	validate_rbio_for_rmw(rbio);
				1469	return;
				1470
				1471	cleanup:
				1472
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1473	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1474	}
				1475
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1476	/*
				1477	* the stripe must be locked by the caller. It will
				1478	* unlock after all the writes are done
				1479	*/
				1480	static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
				1481	{
				1482	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1483	struct bio_list bio_list;
				1484	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1485	int pagenr;
				1486	int stripe;
				1487	struct bio *bio;
				1488
				1489	bio_list_init(&bio_list);
				1490
				1491	ret = alloc_rbio_pages(rbio);
				1492	if (ret)
				1493	goto cleanup;
				1494
				1495	index_rbio_pages(rbio);
				1496
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1497	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1498	/*
				1499	* build a list of bios to read all the missing parts of this
				1500	* stripe
				1501	*/
				1502	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1503	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1504	struct page *page;
				1505	/*
				1506	* we want to find all the pages missing from
				1507	* the rbio and read them from the disk. If
				1508	* page_in_rbio finds a page in the bio list
				1509	* we don't need to read it off the stripe.
				1510	*/
				1511	page = page_in_rbio(rbio, stripe, pagenr, 1);
				1512	if (page)
				1513	continue;
				1514
				1515	page = rbio_stripe_page(rbio, stripe, pagenr);
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1516	/*
				1517	* the bio cache may have handed us an uptodate
				1518	* page. If so, be happy and use it
				1519	*/
				1520	if (PageUptodate(page))
				1521	continue;
				1522
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1523	ret = rbio_add_io_page(rbio, &bio_list, page,
				1524	stripe, pagenr, rbio->stripe_len);
				1525	if (ret)
				1526	goto cleanup;
				1527	}
				1528	}
				1529
				1530	bios_to_read = bio_list_size(&bio_list);
				1531	if (!bios_to_read) {
				1532	/*
				1533	* this can happen if others have merged with
				1534	* us, it means there is nothing left to read.
				1535	* But if there are missing devices it may not be
				1536	* safe to do the full stripe write yet.
				1537	*/
				1538	goto finish;
				1539	}
				1540
				1541	/*
				1542	* the bbio may be freed once we submit the last bio. Make sure
				1543	* not to touch it after that
				1544	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	1545	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	1546	while ((bio = bio_list_pop(&bio_list))) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1547	bio->bi_private = rbio;
				1548	bio->bi_end_io = raid_rmw_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	1549	bio->bi_opf = REQ_OP_READ;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1550
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1551	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1552
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	1553	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1554	}
				1555	/* the actual write will happen once the reads are done */
				1556	return 0;
				1557
				1558	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1559	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	1560
				1561	while ((bio = bio_list_pop(&bio_list)))
				1562	bio_put(bio);
				1563
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1564	return -EIO;
				1565
				1566	finish:
				1567	validate_rbio_for_rmw(rbio);
				1568	return 0;
				1569	}
				1570
				1571	/*
				1572	* if the upper layers pass in a full stripe, we thank them by only allocating
				1573	* enough pages to hold the parity, and sending it all down quickly.
				1574	*/
				1575	static int full_stripe_write(struct btrfs_raid_bio *rbio)
				1576	{
				1577	int ret;
				1578
				1579	ret = alloc_rbio_parity_pages(rbio);
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1580	if (ret) {
				1581	__free_raid_bio(rbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1582	return ret;
Miao Xie	3cd846d	2013-07-22 16:36:57 +0800	[diff] [blame]	1583	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1584
				1585	ret = lock_stripe_add(rbio);
				1586	if (ret == 0)
				1587	finish_rmw(rbio);
				1588	return 0;
				1589	}
				1590
				1591	/*
				1592	* partial stripe writes get handed over to async helpers.
				1593	* We're really hoping to merge a few more writes into this
				1594	* rbio before calculating new parity
				1595	*/
				1596	static int partial_stripe_write(struct btrfs_raid_bio *rbio)
				1597	{
				1598	int ret;
				1599
				1600	ret = lock_stripe_add(rbio);
				1601	if (ret == 0)
David Sterba	cf6a4a7	2018-06-29 10:56:58 +0200	[diff] [blame]	1602	start_async_work(rbio, rmw_work);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1603	return 0;
				1604	}
				1605
				1606	/*
				1607	* sometimes while we were reading from the drive to
				1608	* recalculate parity, enough new bios come into create
				1609	* a full stripe. So we do a check here to see if we can
				1610	* go directly to finish_rmw
				1611	*/
				1612	static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
				1613	{
				1614	/* head off into rmw land if we don't have a full stripe */
				1615	if (!rbio_is_full(rbio))
				1616	return partial_stripe_write(rbio);
				1617	return full_stripe_write(rbio);
				1618	}
				1619
				1620	/*
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1621	* We use plugging call backs to collect full stripes.
				1622	* Any time we get a partial stripe write while plugged
				1623	* we collect it into a list. When the unplug comes down,
				1624	* we sort the list by logical block number and merge
				1625	* everything we can into the same rbios
				1626	*/
				1627	struct btrfs_plug_cb {
				1628	struct blk_plug_cb cb;
				1629	struct btrfs_fs_info *info;
				1630	struct list_head rbio_list;
				1631	struct btrfs_work work;
				1632	};
				1633
				1634	/*
				1635	* rbios on the plug list are sorted for easier merging.
				1636	*/
				1637	static int plug_cmp(void priv, struct list_head a, struct list_head *b)
				1638	{
				1639	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
				1640	plug_list);
				1641	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
				1642	plug_list);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1643	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
				1644	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1645
				1646	if (a_sector < b_sector)
				1647	return -1;
				1648	if (a_sector > b_sector)
				1649	return 1;
				1650	return 0;
				1651	}
				1652
				1653	static void run_plug(struct btrfs_plug_cb *plug)
				1654	{
				1655	struct btrfs_raid_bio *cur;
				1656	struct btrfs_raid_bio *last = NULL;
				1657
				1658	/*
				1659	* sort our plug list then try to merge
				1660	* everything we can in hopes of creating full
				1661	* stripes.
				1662	*/
				1663	list_sort(NULL, &plug->rbio_list, plug_cmp);
				1664	while (!list_empty(&plug->rbio_list)) {
				1665	cur = list_entry(plug->rbio_list.next,
				1666	struct btrfs_raid_bio, plug_list);
				1667	list_del_init(&cur->plug_list);
				1668
				1669	if (rbio_is_full(cur)) {
David Sterba	c7b562c	2018-06-29 10:57:10 +0200	[diff] [blame]	1670	int ret;
				1671
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1672	/* we have a full stripe, send it down */
David Sterba	c7b562c	2018-06-29 10:57:10 +0200	[diff] [blame]	1673	ret = full_stripe_write(cur);
				1674	BUG_ON(ret);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1675	continue;
				1676	}
				1677	if (last) {
				1678	if (rbio_can_merge(last, cur)) {
				1679	merge_rbio(last, cur);
				1680	__free_raid_bio(cur);
				1681	continue;
				1682
				1683	}
				1684	__raid56_parity_write(last);
				1685	}
				1686	last = cur;
				1687	}
				1688	if (last) {
				1689	__raid56_parity_write(last);
				1690	}
				1691	kfree(plug);
				1692	}
				1693
				1694	/*
				1695	* if the unplug comes from schedule, we have to push the
				1696	* work off to a helper thread
				1697	*/
				1698	static void unplug_work(struct btrfs_work *work)
				1699	{
				1700	struct btrfs_plug_cb *plug;
				1701	plug = container_of(work, struct btrfs_plug_cb, work);
				1702	run_plug(plug);
				1703	}
				1704
				1705	static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
				1706	{
				1707	struct btrfs_plug_cb *plug;
				1708	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1709
				1710	if (from_schedule) {
Omar Sandoval	a0cac0e	2019-09-16 11:30:57 -0700	[diff] [blame]	1711	btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
Qu Wenruo	d05a33a	2014-02-28 10:46:11 +0800	[diff] [blame]	1712	btrfs_queue_work(plug->info->rmw_workers,
				1713	&plug->work);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1714	return;
				1715	}
				1716	run_plug(plug);
				1717	}
				1718
				1719	/*
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1720	* our main entry point for writes from the rest of the FS.
				1721	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1722	int raid56_parity_write(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1723	struct btrfs_bio *bbio, u64 stripe_len)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1724	{
				1725	struct btrfs_raid_bio *rbio;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1726	struct btrfs_plug_cb *plug = NULL;
				1727	struct blk_plug_cb *cb;
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1728	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1729
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	1730	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1731	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	1732	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1733	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	1734	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1735	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	1736	rbio->bio_list_bytes = bio->bi_iter.bi_size;
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1737	rbio->operation = BTRFS_RBIO_WRITE;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1738
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1739	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1740	rbio->generic_bio_cnt = 1;
				1741
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1742	/*
				1743	* don't plug on full rbios, just get them out the door
				1744	* as quickly as we can
				1745	*/
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1746	if (rbio_is_full(rbio)) {
				1747	ret = full_stripe_write(rbio);
				1748	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1749	btrfs_bio_counter_dec(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1750	return ret;
				1751	}
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1752
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1753	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1754	if (cb) {
				1755	plug = container_of(cb, struct btrfs_plug_cb, cb);
				1756	if (!plug->info) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1757	plug->info = fs_info;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1758	INIT_LIST_HEAD(&plug->rbio_list);
				1759	}
				1760	list_add_tail(&rbio->plug_list, &plug->rbio_list);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1761	ret = 0;
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1762	} else {
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1763	ret = __raid56_parity_write(rbio);
				1764	if (ret)
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	1765	btrfs_bio_counter_dec(fs_info);
Chris Mason	6ac0f48	2013-01-31 14:42:28 -0500	[diff] [blame]	1766	}
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	1767	return ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1768	}
				1769
				1770	/*
				1771	* all parity reconstruction happens here. We've read in everything
				1772	* we can find from the drives and this does the heavy lifting of
				1773	* sorting the good from the bad.
				1774	*/
				1775	static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
				1776	{
				1777	int pagenr, stripe;
				1778	void **pointers;
				1779	int faila = -1, failb = -1;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1780	struct page *page;
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1781	blk_status_t err;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1782	int i;
				1783
David Sterba	31e818f	2015-02-20 18:00:26 +0100	[diff] [blame]	1784	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1785	if (!pointers) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1786	err = BLK_STS_RESOURCE;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1787	goto cleanup_io;
				1788	}
				1789
				1790	faila = rbio->faila;
				1791	failb = rbio->failb;
				1792
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1793	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1794	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1795	spin_lock_irq(&rbio->bio_list_lock);
				1796	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
				1797	spin_unlock_irq(&rbio->bio_list_lock);
				1798	}
				1799
				1800	index_rbio_pages(rbio);
				1801
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1802	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1803	/*
				1804	* Now we just use bitmap to mark the horizontal stripes in
				1805	* which we have data when doing parity scrub.
				1806	*/
				1807	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
				1808	!test_bit(pagenr, rbio->dbitmap))
				1809	continue;
				1810
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1811	/* setup our array of pointers with pages
				1812	* from each stripe
				1813	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1814	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1815	/*
				1816	* if we're rebuilding a read, we have to use
				1817	* pages from the bio list
				1818	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1819	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1820	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1821	(stripe == faila \|\| stripe == failb)) {
				1822	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1823	} else {
				1824	page = rbio_stripe_page(rbio, stripe, pagenr);
				1825	}
				1826	pointers[stripe] = kmap(page);
				1827	}
				1828
				1829	/* all raid6 handling here */
Zhao Lei	10f1190	2015-01-20 15:11:43 +0800	[diff] [blame]	1830	if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1831	/*
				1832	* single failure, rebuild from parity raid5
				1833	* style
				1834	*/
				1835	if (failb < 0) {
				1836	if (faila == rbio->nr_data) {
				1837	/*
				1838	* Just the P stripe has failed, without
				1839	* a bad data or Q stripe.
				1840	* TODO, we should redo the xor here.
				1841	*/
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1842	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1843	goto cleanup;
				1844	}
				1845	/*
				1846	* a single failure in raid6 is rebuilt
				1847	* in the pstripe code below
				1848	*/
				1849	goto pstripe;
				1850	}
				1851
				1852	/* make sure our ps and qs are in order */
Nikolay Borisov	b7d2083	2020-07-02 16:46:46 +0300	[diff] [blame]	1853	if (faila > failb)
				1854	swap(faila, failb);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1855
				1856	/* if the q stripe is failed, do a pstripe reconstruction
				1857	* from the xors.
				1858	* If both the q stripe and the P stripe are failed, we're
				1859	* here due to a crc mismatch and we can't give them the
				1860	* data they want
				1861	*/
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1862	if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
				1863	if (rbio->bbio->raid_map[faila] ==
				1864	RAID5_P_STRIPE) {
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1865	err = BLK_STS_IOERR;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1866	goto cleanup;
				1867	}
				1868	/*
				1869	* otherwise we have one bad data stripe and
				1870	* a good P stripe. raid5!
				1871	*/
				1872	goto pstripe;
				1873	}
				1874
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	1875	if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1876	raid6_datap_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1877	PAGE_SIZE, faila, pointers);
				1878	} else {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1879	raid6_2data_recov(rbio->real_stripes,
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1880	PAGE_SIZE, faila, failb,
				1881	pointers);
				1882	}
				1883	} else {
				1884	void *p;
				1885
				1886	/* rebuild from P stripe here (raid5 or raid6) */
				1887	BUG_ON(failb != -1);
				1888	pstripe:
				1889	/* Copy parity block into failed block to start with */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	1890	copy_page(pointers[faila], pointers[rbio->nr_data]);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1891
				1892	/* rearrange the pointer array */
				1893	p = pointers[faila];
				1894	for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				1895	pointers[stripe] = pointers[stripe + 1];
				1896	pointers[rbio->nr_data - 1] = p;
				1897
				1898	/* xor in the rest */
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	1899	run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1900	}
				1901	/* if we're doing this rebuild as part of an rmw, go through
				1902	* and set all of our private rbio pages in the
				1903	* failed stripes as uptodate. This way finish_rmw will
				1904	* know they can be trusted. If this was a read reconstruction,
				1905	* other endio functions will fiddle the uptodate bits
				1906	*/
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	1907	if (rbio->operation == BTRFS_RBIO_WRITE) {
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	1908	for (i = 0; i < rbio->stripe_npages; i++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1909	if (faila != -1) {
				1910	page = rbio_stripe_page(rbio, faila, i);
				1911	SetPageUptodate(page);
				1912	}
				1913	if (failb != -1) {
				1914	page = rbio_stripe_page(rbio, failb, i);
				1915	SetPageUptodate(page);
				1916	}
				1917	}
				1918	}
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	1919	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1920	/*
				1921	* if we're rebuilding a read, we have to use
				1922	* pages from the bio list
				1923	*/
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	1924	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1925	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1926	(stripe == faila \|\| stripe == failb)) {
				1927	page = page_in_rbio(rbio, stripe, pagenr, 0);
				1928	} else {
				1929	page = rbio_stripe_page(rbio, stripe, pagenr);
				1930	}
				1931	kunmap(page);
				1932	}
				1933	}
				1934
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1935	err = BLK_STS_OK;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1936	cleanup:
				1937	kfree(pointers);
				1938
				1939	cleanup_io:
Liu Bo	580c6ef	2018-03-22 09:20:11 +0800	[diff] [blame]	1940	/*
				1941	* Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
				1942	* valid rbio which is consistent with ondisk content, thus such a
				1943	* valid rbio can be cached to avoid further disk reads.
				1944	*/
				1945	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				1946	rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	1947	/*
				1948	* - In case of two failures, where rbio->failb != -1:
				1949	*
				1950	* Do not cache this rbio since the above read reconstruction
				1951	* (raid6_datap_recov() or raid6_2data_recov()) may have
				1952	* changed some content of stripes which are not identical to
				1953	* on-disk content any more, otherwise, a later write/recover
				1954	* may steal stripe_pages from this rbio and end up with
				1955	* corruptions or rebuild failures.
				1956	*
				1957	* - In case of single failure, where rbio->failb == -1:
				1958	*
				1959	* Cache this rbio iff the above read reconstruction is
Andrea Gelmini	52042d8	2018-11-28 12:05:13 +0100	[diff] [blame]	1960	* executed without problems.
Liu Bo	44ac474	2018-01-12 18:07:02 -0700	[diff] [blame]	1961	*/
				1962	if (err == BLK_STS_OK && rbio->failb < 0)
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	1963	cache_rbio_pages(rbio);
				1964	else
				1965	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				1966
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1967	rbio_orig_end_io(rbio, err);
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	1968	} else if (err == BLK_STS_OK) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1969	rbio->faila = -1;
				1970	rbio->failb = -1;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	1971
				1972	if (rbio->operation == BTRFS_RBIO_WRITE)
				1973	finish_rmw(rbio);
				1974	else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
				1975	finish_parity_scrub(rbio, 0);
				1976	else
				1977	BUG();
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1978	} else {
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1979	rbio_orig_end_io(rbio, err);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1980	}
				1981	}
				1982
				1983	/*
				1984	* This is called only for stripes we've read from disk to
				1985	* reconstruct the parity.
				1986	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	1987	static void raid_recover_end_io(struct bio *bio)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1988	{
				1989	struct btrfs_raid_bio *rbio = bio->bi_private;
				1990
				1991	/*
				1992	* we only read stripe pages off the disk, set them
				1993	* up to date if there were no errors
				1994	*/
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	1995	if (bio->bi_status)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	1996	fail_bio_stripe(rbio, bio);
				1997	else
				1998	set_bio_pages_uptodate(bio);
				1999	bio_put(bio);
				2000
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2001	if (!atomic_dec_and_test(&rbio->stripes_pending))
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2002	return;
				2003
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2004	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2005	rbio_orig_end_io(rbio, BLK_STS_IOERR);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2006	else
				2007	__raid_recover_end_io(rbio);
				2008	}
				2009
				2010	/*
				2011	* reads everything we need off the disk to reconstruct
				2012	* the parity. endio handlers trigger final reconstruction
				2013	* when the IO is done.
				2014	*
				2015	* This is used both for reads from the higher layers and for
				2016	* parity construction required to finish a rmw cycle.
				2017	*/
				2018	static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
				2019	{
				2020	int bios_to_read = 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2021	struct bio_list bio_list;
				2022	int ret;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2023	int pagenr;
				2024	int stripe;
				2025	struct bio *bio;
				2026
				2027	bio_list_init(&bio_list);
				2028
				2029	ret = alloc_rbio_pages(rbio);
				2030	if (ret)
				2031	goto cleanup;
				2032
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2033	atomic_set(&rbio->error, 0);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2034
				2035	/*
Chris Mason	4ae10b3	2013-01-31 14:42:09 -0500	[diff] [blame]	2036	* read everything that hasn't failed. Thanks to the
				2037	* stripe cache, it is possible that some or all of these
				2038	* pages are going to be uptodate.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2039	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2040	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2041	if (rbio->faila == stripe \|\| rbio->failb == stripe) {
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2042	atomic_inc(&rbio->error);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2043	continue;
Liu Bo	5588383	2014-06-24 15:39:16 +0800	[diff] [blame]	2044	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2045
Zhao Lei	915e229	2015-03-03 20:42:48 +0800	[diff] [blame]	2046	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2047	struct page *p;
				2048
				2049	/*
				2050	* the rmw code may have already read this
				2051	* page in
				2052	*/
				2053	p = rbio_stripe_page(rbio, stripe, pagenr);
				2054	if (PageUptodate(p))
				2055	continue;
				2056
				2057	ret = rbio_add_io_page(rbio, &bio_list,
				2058	rbio_stripe_page(rbio, stripe, pagenr),
				2059	stripe, pagenr, rbio->stripe_len);
				2060	if (ret < 0)
				2061	goto cleanup;
				2062	}
				2063	}
				2064
				2065	bios_to_read = bio_list_size(&bio_list);
				2066	if (!bios_to_read) {
				2067	/*
				2068	* we might have no bios to read just because the pages
				2069	* were up to date, or we might have no bios to read because
				2070	* the devices were gone.
				2071	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2072	if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2073	__raid_recover_end_io(rbio);
Nikolay Borisov	813f8a0	2020-07-15 14:02:17 +0300	[diff] [blame]	2074	return 0;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2075	} else {
				2076	goto cleanup;
				2077	}
				2078	}
				2079
				2080	/*
				2081	* the bbio may be freed once we submit the last bio. Make sure
				2082	* not to touch it after that
				2083	*/
Miao Xie	b89e1b0	2014-10-15 11:18:44 +0800	[diff] [blame]	2084	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2085	while ((bio = bio_list_pop(&bio_list))) {
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2086	bio->bi_private = rbio;
				2087	bio->bi_end_io = raid_recover_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2088	bio->bi_opf = REQ_OP_READ;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2089
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2090	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2091
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2092	submit_bio(bio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2093	}
Nikolay Borisov	813f8a0	2020-07-15 14:02:17 +0300	[diff] [blame]	2094
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2095	return 0;
				2096
				2097	cleanup:
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2098	if (rbio->operation == BTRFS_RBIO_READ_REBUILD \|\|
				2099	rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2100	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2101
				2102	while ((bio = bio_list_pop(&bio_list)))
				2103	bio_put(bio);
				2104
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2105	return -EIO;
				2106	}
				2107
				2108	/*
				2109	* the main entry point for reads from the higher layers. This
				2110	* is really only called when the normal read path had a failure,
				2111	* so we assume the bio they send down corresponds to a failed part
				2112	* of the drive.
				2113	*/
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2114	int raid56_parity_recover(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2115	struct btrfs_bio *bbio, u64 stripe_len,
				2116	int mirror_num, int generic_io)
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2117	{
				2118	struct btrfs_raid_bio *rbio;
				2119	int ret;
				2120
Liu Bo	abad60c	2017-03-29 10:54:26 -0700	[diff] [blame]	2121	if (generic_io) {
				2122	ASSERT(bbio->mirror_num == mirror_num);
				2123	btrfs_io_bio(bio)->mirror_num = mirror_num;
				2124	}
				2125
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2126	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2127	if (IS_ERR(rbio)) {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2128	if (generic_io)
				2129	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2130	return PTR_ERR(rbio);
Miao Xie	af8e2d1	2014-10-23 14:42:50 +0800	[diff] [blame]	2131	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2132
Miao Xie	1b94b55	2014-11-06 16:14:21 +0800	[diff] [blame]	2133	rbio->operation = BTRFS_RBIO_READ_REBUILD;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2134	bio_list_add(&rbio->bio_list, bio);
Kent Overstreet	4f024f3	2013-10-11 15:44:27 -0700	[diff] [blame]	2135	rbio->bio_list_bytes = bio->bi_iter.bi_size;
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2136
				2137	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2138	if (rbio->faila == -1) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2139	btrfs_warn(fs_info,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2140	"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
David Sterba	1201b58	2020-11-26 15:41:27 +0100	[diff] [blame]	2141	__func__, bio->bi_iter.bi_sector << 9,
Liu Bo	e46a28c	2016-07-29 10:57:55 -0700	[diff] [blame]	2142	(u64)bio->bi_iter.bi_size, bbio->map_type);
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2143	if (generic_io)
				2144	btrfs_put_bbio(bbio);
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2145	kfree(rbio);
				2146	return -EIO;
				2147	}
				2148
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2149	if (generic_io) {
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2150	btrfs_bio_counter_inc_noblocked(fs_info);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2151	rbio->generic_bio_cnt = 1;
				2152	} else {
Zhao Lei	6e9606d	2015-01-20 15:11:34 +0800	[diff] [blame]	2153	btrfs_get_bbio(bbio);
Miao Xie	4245215	2014-11-25 16:39:28 +0800	[diff] [blame]	2154	}
				2155
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2156	/*
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2157	* Loop retry:
				2158	* for 'mirror == 2', reconstruct from all other stripes.
				2159	* for 'mirror_num > 2', select a stripe to fail on every retry.
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2160	*/
Liu Bo	8810f75	2018-01-02 13:36:41 -0700	[diff] [blame]	2161	if (mirror_num > 2) {
				2162	/*
				2163	* 'mirror == 3' is to fail the p stripe and
				2164	* reconstruct from the q stripe. 'mirror > 3' is to
				2165	* fail a data stripe and reconstruct from p+q stripe.
				2166	*/
				2167	rbio->failb = rbio->real_stripes - (mirror_num - 1);
				2168	ASSERT(rbio->failb > 0);
				2169	if (rbio->failb <= rbio->faila)
				2170	rbio->failb--;
				2171	}
David Woodhouse	53b381b	2013-01-29 18:40:14 -0500	[diff] [blame]	2172
				2173	ret = lock_stripe_add(rbio);
				2174
				2175	/*
				2176	* __raid56_parity_recover will end the bio with
				2177	* any errors it hits. We don't want to return
				2178	* its error value up the stack because our caller
				2179	* will end up calling bio_endio with any nonzero
				2180	* return
				2181	*/
				2182	if (ret == 0)
				2183	__raid56_parity_recover(rbio);
				2184	/*
				2185	* our rbio has been added to the list of
				2186	* rbios that will be handled after the
				2187	* currently lock owner is done
				2188	*/
				2189	return 0;
				2190
				2191	}
				2192
				2193	static void rmw_work(struct btrfs_work *work)
				2194	{
				2195	struct btrfs_raid_bio *rbio;
				2196
				2197	rbio = container_of(work, struct btrfs_raid_bio, work);
				2198	raid56_rmw_stripe(rbio);
				2199	}
				2200
				2201	static void read_rebuild_work(struct btrfs_work *work)
				2202	{
				2203	struct btrfs_raid_bio *rbio;
				2204
				2205	rbio = container_of(work, struct btrfs_raid_bio, work);
				2206	__raid56_parity_recover(rbio);
				2207	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2208
				2209	/*
				2210	* The following code is used to scrub/replace the parity stripe
				2211	*
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2212	* Caller must have already increased bio_counter for getting @bbio.
				2213	*
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2214	* Note: We need make sure all the pages that add into the scrub/replace
				2215	* raid bio are correct and not be changed during the scrub/replace. That
				2216	* is those pages just hold metadata or file data with checksum.
				2217	*/
				2218
				2219	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2220	raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2221	struct btrfs_bio *bbio, u64 stripe_len,
				2222	struct btrfs_device *scrub_dev,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2223	unsigned long *dbitmap, int stripe_nsectors)
				2224	{
				2225	struct btrfs_raid_bio *rbio;
				2226	int i;
				2227
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2228	rbio = alloc_rbio(fs_info, bbio, stripe_len);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2229	if (IS_ERR(rbio))
				2230	return NULL;
				2231	bio_list_add(&rbio->bio_list, bio);
				2232	/*
				2233	* This is a special bio which is used to hold the completion handler
				2234	* and make the scrub rbio is similar to the other types
				2235	*/
				2236	ASSERT(!bio->bi_iter.bi_size);
				2237	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
				2238
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2239	/*
				2240	* After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
				2241	* to the end position, so this search can start from the first parity
				2242	* stripe.
				2243	*/
				2244	for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2245	if (bbio->stripes[i].dev == scrub_dev) {
				2246	rbio->scrubp = i;
				2247	break;
				2248	}
				2249	}
Liu Bo	9cd3a7e	2017-08-03 13:53:31 -0600	[diff] [blame]	2250	ASSERT(i < rbio->real_stripes);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2251
				2252	/* Now we just support the sectorsize equals to page size */
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2253	ASSERT(fs_info->sectorsize == PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2254	ASSERT(rbio->stripe_npages == stripe_nsectors);
				2255	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
				2256
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2257	/*
				2258	* We have already increased bio_counter when getting bbio, record it
				2259	* so we can free it at rbio_orig_end_io().
				2260	*/
				2261	rbio->generic_bio_cnt = 1;
				2262
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2263	return rbio;
				2264	}
				2265
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2266	/* Used for both parity scrub and missing. */
				2267	void raid56_add_scrub_pages(struct btrfs_raid_bio rbio, struct page page,
				2268	u64 logical)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2269	{
				2270	int stripe_offset;
				2271	int index;
				2272
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2273	ASSERT(logical >= rbio->bbio->raid_map[0]);
				2274	ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2275	rbio->stripe_len * rbio->nr_data);
Zhao Lei	8e5cfb5	2015-01-20 15:11:33 +0800	[diff] [blame]	2276	stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2277	index = stripe_offset >> PAGE_SHIFT;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2278	rbio->bio_pages[index] = page;
				2279	}
				2280
				2281	/*
				2282	* We just scrub the parity that we have correct data on the same horizontal,
				2283	* so we needn't allocate all pages for all the stripes.
				2284	*/
				2285	static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
				2286	{
				2287	int i;
				2288	int bit;
				2289	int index;
				2290	struct page *page;
				2291
				2292	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2293	for (i = 0; i < rbio->real_stripes; i++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2294	index = i * rbio->stripe_npages + bit;
				2295	if (rbio->stripe_pages[index])
				2296	continue;
				2297
				2298	page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2299	if (!page)
				2300	return -ENOMEM;
				2301	rbio->stripe_pages[index] = page;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2302	}
				2303	}
				2304	return 0;
				2305	}
				2306
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2307	static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
				2308	int need_check)
				2309	{
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2310	struct btrfs_bio *bbio = rbio->bbio;
Kees Cook	1389053	2018-05-29 16:44:59 -0700	[diff] [blame]	2311	void **pointers = rbio->finish_pointers;
				2312	unsigned long *pbitmap = rbio->finish_pbitmap;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2313	int nr_data = rbio->nr_data;
				2314	int stripe;
				2315	int pagenr;
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2316	bool has_qstripe;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2317	struct page *p_page = NULL;
				2318	struct page *q_page = NULL;
				2319	struct bio_list bio_list;
				2320	struct bio *bio;
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2321	int is_replace = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2322	int ret;
				2323
				2324	bio_list_init(&bio_list);
				2325
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2326	if (rbio->real_stripes - rbio->nr_data == 1)
				2327	has_qstripe = false;
				2328	else if (rbio->real_stripes - rbio->nr_data == 2)
				2329	has_qstripe = true;
				2330	else
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2331	BUG();
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2332
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2333	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
				2334	is_replace = 1;
				2335	bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
				2336	}
				2337
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2338	/*
				2339	* Because the higher layers(scrubber) are unlikely to
				2340	* use this area of the disk again soon, so don't cache
				2341	* it.
				2342	*/
				2343	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
				2344
				2345	if (!need_check)
				2346	goto writeback;
				2347
				2348	p_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2349	if (!p_page)
				2350	goto cleanup;
				2351	SetPageUptodate(p_page);
				2352
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2353	if (has_qstripe) {
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2354	/* RAID6, allocate and map temp space for the Q stripe */
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2355	q_page = alloc_page(GFP_NOFS \| __GFP_HIGHMEM);
				2356	if (!q_page) {
				2357	__free_page(p_page);
				2358	goto cleanup;
				2359	}
				2360	SetPageUptodate(q_page);
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2361	pointers[rbio->real_stripes - 1] = kmap(q_page);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2362	}
				2363
				2364	atomic_set(&rbio->error, 0);
				2365
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2366	/* Map the parity stripe just once */
				2367	pointers[nr_data] = kmap(p_page);
				2368
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2369	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2370	struct page *p;
				2371	void *parity;
				2372	/* first collect one page from each data stripe */
				2373	for (stripe = 0; stripe < nr_data; stripe++) {
				2374	p = page_in_rbio(rbio, stripe, pagenr, 0);
				2375	pointers[stripe] = kmap(p);
				2376	}
				2377
David Sterba	c17af96	2020-02-19 15:17:20 +0100	[diff] [blame]	2378	if (has_qstripe) {
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2379	/* RAID6, call the library function to fill in our P/Q */
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2380	raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2381	pointers);
				2382	} else {
				2383	/* raid5 */
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	2384	copy_page(pointers[nr_data], pointers[0]);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2385	run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2386	}
				2387
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2388	/* Check scrubbing parity and repair it */
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2389	p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2390	parity = kmap(p);
Kirill A. Shutemov	09cbfea	2016-04-01 15:29:47 +0300	[diff] [blame]	2391	if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
David Sterba	69d2480	2018-06-29 10:56:44 +0200	[diff] [blame]	2392	copy_page(parity, pointers[rbio->scrubp]);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2393	else
				2394	/* Parity is right, needn't writeback */
				2395	bitmap_clear(rbio->dbitmap, pagenr, 1);
				2396	kunmap(p);
				2397
Andrea Righi	3897b6f	2019-03-14 08:56:28 +0100	[diff] [blame]	2398	for (stripe = 0; stripe < nr_data; stripe++)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2399	kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
				2400	}
				2401
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2402	kunmap(p_page);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2403	__free_page(p_page);
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2404	if (q_page) {
				2405	kunmap(q_page);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2406	__free_page(q_page);
Ira Weiny	d70cef0	2021-01-27 22:15:03 -0800	[diff] [blame]	2407	}
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2408
				2409	writeback:
				2410	/*
				2411	* time to start writing. Make bios for everything from the
				2412	* higher layers (the bio_list in our rbio) and our p/q. Ignore
				2413	* everything else.
				2414	*/
				2415	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2416	struct page *page;
				2417
				2418	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2419	ret = rbio_add_io_page(rbio, &bio_list,
				2420	page, rbio->scrubp, pagenr, rbio->stripe_len);
				2421	if (ret)
				2422	goto cleanup;
				2423	}
				2424
Miao Xie	7603597	2014-11-14 17:45:42 +0800	[diff] [blame]	2425	if (!is_replace)
				2426	goto submit_write;
				2427
				2428	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
				2429	struct page *page;
				2430
				2431	page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
				2432	ret = rbio_add_io_page(rbio, &bio_list, page,
				2433	bbio->tgtdev_map[rbio->scrubp],
				2434	pagenr, rbio->stripe_len);
				2435	if (ret)
				2436	goto cleanup;
				2437	}
				2438
				2439	submit_write:
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2440	nr_data = bio_list_size(&bio_list);
				2441	if (!nr_data) {
				2442	/* Every parity is right */
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2443	rbio_orig_end_io(rbio, BLK_STS_OK);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2444	return;
				2445	}
				2446
				2447	atomic_set(&rbio->stripes_pending, nr_data);
				2448
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2449	while ((bio = bio_list_pop(&bio_list))) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2450	bio->bi_private = rbio;
Zhao Lei	a6111d11b	2016-01-12 17:52:13 +0800	[diff] [blame]	2451	bio->bi_end_io = raid_write_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2452	bio->bi_opf = REQ_OP_WRITE;
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2453
				2454	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2455	}
				2456	return;
				2457
				2458	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2459	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2460
				2461	while ((bio = bio_list_pop(&bio_list)))
				2462	bio_put(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2463	}
				2464
				2465	static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
				2466	{
				2467	if (stripe >= 0 && stripe < rbio->nr_data)
				2468	return 1;
				2469	return 0;
				2470	}
				2471
				2472	/*
				2473	* While we're doing the parity check and repair, we could have errors
				2474	* in reading pages off the disk. This checks for errors and if we're
				2475	* not able to read the page it'll trigger parity reconstruction. The
				2476	* parity scrub will be finished after we've reconstructed the failed
				2477	* stripes
				2478	*/
				2479	static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
				2480	{
				2481	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
				2482	goto cleanup;
				2483
				2484	if (rbio->faila >= 0 \|\| rbio->failb >= 0) {
				2485	int dfail = 0, failp = -1;
				2486
				2487	if (is_data_stripe(rbio, rbio->faila))
				2488	dfail++;
				2489	else if (is_parity_stripe(rbio->faila))
				2490	failp = rbio->faila;
				2491
				2492	if (is_data_stripe(rbio, rbio->failb))
				2493	dfail++;
				2494	else if (is_parity_stripe(rbio->failb))
				2495	failp = rbio->failb;
				2496
				2497	/*
				2498	* Because we can not use a scrubbing parity to repair
				2499	* the data, so the capability of the repair is declined.
				2500	* (In the case of RAID5, we can not repair anything)
				2501	*/
				2502	if (dfail > rbio->bbio->max_errors - 1)
				2503	goto cleanup;
				2504
				2505	/*
				2506	* If all data is good, only parity is correctly, just
				2507	* repair the parity.
				2508	*/
				2509	if (dfail == 0) {
				2510	finish_parity_scrub(rbio, 0);
				2511	return;
				2512	}
				2513
				2514	/*
				2515	* Here means we got one corrupted data stripe and one
				2516	* corrupted parity on RAID6, if the corrupted parity
Nicholas D Steeves	0132761	2016-05-19 21:18:45 -0400	[diff] [blame]	2517	* is scrubbing parity, luckily, use the other one to repair
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2518	* the data, or we can not repair the data stripe.
				2519	*/
				2520	if (failp != rbio->scrubp)
				2521	goto cleanup;
				2522
				2523	__raid_recover_end_io(rbio);
				2524	} else {
				2525	finish_parity_scrub(rbio, 1);
				2526	}
				2527	return;
				2528
				2529	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2530	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2531	}
				2532
				2533	/*
				2534	* end io for the read phase of the rmw cycle. All the bios here are physical
				2535	* stripe bios we've read from the disk so we can recalculate the parity of the
				2536	* stripe.
				2537	*
				2538	* This will usually kick off finish_rmw once all the bios are read in, but it
				2539	* may trigger parity reconstruction if we had any errors along the way
				2540	*/
Christoph Hellwig	4246a0b	2015-07-20 15:29:37 +0200	[diff] [blame]	2541	static void raid56_parity_scrub_end_io(struct bio *bio)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2542	{
				2543	struct btrfs_raid_bio *rbio = bio->bi_private;
				2544
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	2545	if (bio->bi_status)
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2546	fail_bio_stripe(rbio, bio);
				2547	else
				2548	set_bio_pages_uptodate(bio);
				2549
				2550	bio_put(bio);
				2551
				2552	if (!atomic_dec_and_test(&rbio->stripes_pending))
				2553	return;
				2554
				2555	/*
				2556	* this will normally call finish_rmw to start our write
				2557	* but if there are any failed stripes we'll reconstruct
				2558	* from parity first
				2559	*/
				2560	validate_rbio_for_parity_scrub(rbio);
				2561	}
				2562
				2563	static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
				2564	{
				2565	int bios_to_read = 0;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2566	struct bio_list bio_list;
				2567	int ret;
				2568	int pagenr;
				2569	int stripe;
				2570	struct bio *bio;
				2571
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2572	bio_list_init(&bio_list);
				2573
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2574	ret = alloc_rbio_essential_pages(rbio);
				2575	if (ret)
				2576	goto cleanup;
				2577
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2578	atomic_set(&rbio->error, 0);
				2579	/*
				2580	* build a list of bios to read all the missing parts of this
				2581	* stripe
				2582	*/
Miao Xie	2c8cdd6	2014-11-14 16:06:25 +0800	[diff] [blame]	2583	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2584	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
				2585	struct page *page;
				2586	/*
				2587	* we want to find all the pages missing from
				2588	* the rbio and read them from the disk. If
				2589	* page_in_rbio finds a page in the bio list
				2590	* we don't need to read it off the stripe.
				2591	*/
				2592	page = page_in_rbio(rbio, stripe, pagenr, 1);
				2593	if (page)
				2594	continue;
				2595
				2596	page = rbio_stripe_page(rbio, stripe, pagenr);
				2597	/*
				2598	* the bio cache may have handed us an uptodate
				2599	* page. If so, be happy and use it
				2600	*/
				2601	if (PageUptodate(page))
				2602	continue;
				2603
				2604	ret = rbio_add_io_page(rbio, &bio_list, page,
				2605	stripe, pagenr, rbio->stripe_len);
				2606	if (ret)
				2607	goto cleanup;
				2608	}
				2609	}
				2610
				2611	bios_to_read = bio_list_size(&bio_list);
				2612	if (!bios_to_read) {
				2613	/*
				2614	* this can happen if others have merged with
				2615	* us, it means there is nothing left to read.
				2616	* But if there are missing devices it may not be
				2617	* safe to do the full stripe write yet.
				2618	*/
				2619	goto finish;
				2620	}
				2621
				2622	/*
				2623	* the bbio may be freed once we submit the last bio. Make sure
				2624	* not to touch it after that
				2625	*/
				2626	atomic_set(&rbio->stripes_pending, bios_to_read);
Nikolay Borisov	bf28a60	2020-07-02 16:46:43 +0300	[diff] [blame]	2627	while ((bio = bio_list_pop(&bio_list))) {
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2628	bio->bi_private = rbio;
				2629	bio->bi_end_io = raid56_parity_scrub_end_io;
David Sterba	ebcc326	2018-06-29 10:56:53 +0200	[diff] [blame]	2630	bio->bi_opf = REQ_OP_READ;
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2631
Jeff Mahoney	0b246af	2016-06-22 18:54:23 -0400	[diff] [blame]	2632	btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2633
Mike Christie	4e49ea4	2016-06-05 14:31:41 -0500	[diff] [blame]	2634	submit_bio(bio);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2635	}
				2636	/* the actual write will happen once the reads are done */
				2637	return;
				2638
				2639	cleanup:
Omar Sandoval	58efbc9	2017-08-22 23:45:59 -0700	[diff] [blame]	2640	rbio_orig_end_io(rbio, BLK_STS_IOERR);
Liu Bo	785884f	2017-09-22 12:11:18 -0600	[diff] [blame]	2641
				2642	while ((bio = bio_list_pop(&bio_list)))
				2643	bio_put(bio);
				2644
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2645	return;
				2646
				2647	finish:
				2648	validate_rbio_for_parity_scrub(rbio);
				2649	}
				2650
				2651	static void scrub_parity_work(struct btrfs_work *work)
				2652	{
				2653	struct btrfs_raid_bio *rbio;
				2654
				2655	rbio = container_of(work, struct btrfs_raid_bio, work);
				2656	raid56_parity_scrub_stripe(rbio);
				2657	}
				2658
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2659	void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
				2660	{
				2661	if (!lock_stripe_add(rbio))
David Sterba	a81b747	2018-06-29 10:57:03 +0200	[diff] [blame]	2662	start_async_work(rbio, scrub_parity_work);
Miao Xie	5a6ac9e	2014-11-06 17:20:58 +0800	[diff] [blame]	2663	}
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2664
				2665	/* The following code is used for dev replace of a missing RAID 5/6 device. */
				2666
				2667	struct btrfs_raid_bio *
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2668	raid56_alloc_missing_rbio(struct btrfs_fs_info fs_info, struct bio bio,
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2669	struct btrfs_bio *bbio, u64 length)
				2670	{
				2671	struct btrfs_raid_bio *rbio;
				2672
Jeff Mahoney	2ff7e61	2016-06-22 18:54:24 -0400	[diff] [blame]	2673	rbio = alloc_rbio(fs_info, bbio, length);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2674	if (IS_ERR(rbio))
				2675	return NULL;
				2676
				2677	rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
				2678	bio_list_add(&rbio->bio_list, bio);
				2679	/*
				2680	* This is a special bio which is used to hold the completion handler
				2681	* and make the scrub rbio is similar to the other types
				2682	*/
				2683	ASSERT(!bio->bi_iter.bi_size);
				2684
				2685	rbio->faila = find_logical_bio_stripe(rbio, bio);
				2686	if (rbio->faila == -1) {
				2687	BUG();
				2688	kfree(rbio);
				2689	return NULL;
				2690	}
				2691
Qu Wenruo	ae6529c	2017-03-29 09:33:21 +0800	[diff] [blame]	2692	/*
				2693	* When we get bbio, we have already increased bio_counter, record it
				2694	* so we can free it at rbio_orig_end_io()
				2695	*/
				2696	rbio->generic_bio_cnt = 1;
				2697
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2698	return rbio;
				2699	}
				2700
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2701	void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
				2702	{
				2703	if (!lock_stripe_add(rbio))
David Sterba	e66d8d5	2018-06-29 10:57:00 +0200	[diff] [blame]	2704	start_async_work(rbio, read_rebuild_work);
Omar Sandoval	b4ee178	2015-06-19 11:52:50 -0700	[diff] [blame]	2705	}